#include "../ggml-common.h"
-
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
- aclTensor ** acl_src1, aclTensor ** acl_dst) {
+void bcast_shape(ggml_tensor * src0,
+ ggml_tensor * src1,
+ ggml_tensor * dst,
+ aclTensor ** acl_src0,
+ aclTensor ** acl_src1,
+ aclTensor ** acl_dst) {
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
// Need bcast
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
}
}
-void ggml_cann_op_unary(
- std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
- ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+ ggml_backend_cann_context & ctx,
+ ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
unary_op(ctx, acl_src, acl_dst);
ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
-void ggml_cann_op_unary_gated(
- std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
- ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0];
- ggml_tensor* src1 = dst->src[1];
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+ ggml_backend_cann_context & ctx,
+ ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_is_contiguous_1(src0));
GGML_ASSERT(ggml_is_contiguous_1(dst));
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
- aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
- if(src1) {
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src0 = nullptr, *acl_src1 = nullptr;
+ if (src1) {
GGML_ASSERT(ggml_is_contiguous_1(src1));
GGML_ASSERT(src0->type == src1->type);
acl_src0 = ggml_cann_create_tensor(src0);
acl_src1 = ggml_cann_create_tensor(src1);
} else {
- int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
- size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
- acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
+ int64_t ne[] = { src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3] };
+ size_t nb[] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
+ acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
if (swapped) {
std::swap(acl_src0, acl_src1);
* @param repeat_array The array specifying the number of repetitions along each
* dimension.
*/
-static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst, int64_t* repeat_array) {
+static void aclnn_repeat(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * repeat_array) {
// repeat tensor along each dim with repeat_array
- aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
+ aclIntArray * repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
ggml_cann_release_resources(ctx, repeats);
* @param cast_data_type The target data type to which the source tensor will be
 * cast.
*/
-static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst, aclDataType cast_data_type) {
+static void aclnn_cast(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ aclDataType cast_data_type) {
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
}
-void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
GGML_ASSERT(ggml_can_repeat(src, dst));
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
- int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
- dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
+ int64_t repeatsArray[] = { dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1],
+ dst->ne[0] / src->ne[0] };
aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
-void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
- aclTensor* acl_src1, aclTensor* acl_dst) {
- float alphaValue = 1.0f;
- aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
- if (acl_dst != nullptr)
+void aclnn_add(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
+ float alphaValue = 1.0f;
+ aclScalar * alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ if (acl_dst != nullptr) {
GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
- else
+ } else {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha);
+ }
ggml_cann_release_resources(ctx, alpha);
}
-void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
- aclTensor* acl_src1, aclTensor* acl_dst) {
- float alphaValue = 1.0f;
- aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
- if (acl_dst != nullptr)
+void aclnn_sub(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
+ float alphaValue = 1.0f;
+ aclScalar * alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ if (acl_dst != nullptr) {
GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha, acl_dst);
- else
+ } else {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha);
+ }
ggml_cann_release_resources(ctx, alpha);
}
-void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_other, aclTensor* acl_dst) {
- if (acl_dst != nullptr)
+void aclnn_mul(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
+ if (acl_dst != nullptr) {
GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
- else
+ } else {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
+ }
}
-void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_other, aclTensor* acl_dst) {
- if (acl_dst != nullptr)
+void aclnn_div(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
+ if (acl_dst != nullptr) {
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
- else
+ } else {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
+ }
}
/**
* @param inplace Flag indicating whether to perform the operation in-place on
* `acl_src`.
*/
-static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- float scale, aclTensor* acl_dst, bool inplace) {
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+static void aclnn_muls(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ float scale,
+ aclTensor * acl_dst,
+ bool inplace) {
+ aclScalar * acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
if (inplace) {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
} else {
ggml_cann_release_resources(ctx, acl_scale);
}
-void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
float negative_slope;
memcpy(&negative_slope, dst->op_params, sizeof(float));
- aclScalar* acl_negative_slope =
- aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
+ aclScalar * acl_negative_slope = aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
* stored.
* @param concat_dim The dimension along which the tensors will be concatenated.
*/
-static void aclnn_concat(ggml_backend_cann_context& ctx,
- aclTensorList* tensorList, aclTensor* acl_dst,
- int64_t concat_dim) {
+static void aclnn_concat(ggml_backend_cann_context & ctx,
+ aclTensorList * tensorList,
+ aclTensor * acl_dst,
+ int64_t concat_dim) {
GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
}
-void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0];
- ggml_tensor* src1 = dst->src[1];
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
+ aclTensor * acl_src0 = ggml_cann_create_tensor(src0);
+ aclTensor * acl_src1 = ggml_cann_create_tensor(src1);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
const int32_t dim = ggml_get_op_params_i32(dst, 0);
GGML_ASSERT(dim >= 0 && dim < 4);
int32_t acl_dim = 3 - dim;
- aclTensor* tensors[] = {acl_src0, acl_src1};
- aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
+ aclTensor * tensors[] = { acl_src0, acl_src1 };
+ aclTensorList * tensor_list = aclCreateTensorList(tensors, 2);
aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);
ggml_cann_release_resources(ctx, tensor_list, acl_dst);
* @param step The step size between consecutive values.
* @param n_elements The number of elements in the destination tensor.
*/
-static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
- float start, float stop, float step,
- int64_t n_elements) {
- int64_t steps = (int64_t)std::ceil((stop - start) / step);
+static void aclnn_arange(ggml_backend_cann_context & ctx,
+ aclTensor * acl_dst,
+ float start,
+ float stop,
+ float step,
+ int64_t n_elements) {
+ int64_t steps = (int64_t) std::ceil((stop - start) / step);
GGML_ASSERT(n_elements == steps);
- aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
- aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
- aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
+ aclScalar * acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
+ aclScalar * acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
+ aclScalar * acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
}
-void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
int64_t n_elements = ggml_nelements(dst);
- float start;
- float stop;
- float step;
- memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
- memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
- memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
+ float start;
+ float stop;
+ float step;
+ memcpy(&start, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
+ memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
ggml_cann_release_resources(ctx, acl_dst);
}
-void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
float min;
float max;
memcpy(&min, dst->op_params, sizeof(float));
- memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
- aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
- aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
+ aclScalar * acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
+ aclScalar * acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
}
-void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
// scale factor
float v;
memcpy(&v, dst->op_params, sizeof(float));
- aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclScalar * scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
}
-void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
- enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
-
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
- ggml_cann_pool_alloc temp_buffer_allocator(
- ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
- void* buffer = temp_buffer_allocator.get();
- aclTensor* tmp_tensor =
- ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
- dst->ne, dst->nb, GGML_MAX_DIMS);
- GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
- tmp_tensor);
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
+ void * buffer = temp_buffer_allocator.get();
+ aclTensor * tmp_tensor =
+ ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor);
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
}
-void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
- std::vector<int64_t> normData = {dst->ne[0]};
- aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
- GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
- eps, acl_dst, nullptr, nullptr);
+ std::vector<int64_t> normData = { dst->ne[0] };
+ aclIntArray * norm = aclCreateIntArray(normData.data(), normData.size());
+ GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr, eps, acl_dst, nullptr, nullptr);
ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
}
-void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
int n_groups = dst->op_params[0];
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
- int64_t N = src->ne[3];
- int64_t C = src->ne[2];
+ int64_t N = src->ne[3];
+ int64_t C = src->ne[2];
int64_t HxW = src->ne[1] * src->ne[0];
- size_t type_size = ggml_type_size(src->type);
- int64_t ne[] = {n_groups, N};
- size_t nb[] = {type_size, type_size * n_groups};
- size_t n_bytes = N * n_groups;
+ size_t type_size = ggml_type_size(src->type);
+ int64_t ne[] = { n_groups, N };
+ size_t nb[] = { type_size, type_size * n_groups };
+ size_t n_bytes = N * n_groups;
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
- void* buffer = temp_buffer_allocator.get();
- aclTensor* acl_mean_out = ggml_cann_create_tensor(
- buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
- aclTensor* acl_rstd_out = ggml_cann_create_tensor(
- (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
-
- GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
- acl_dst, acl_mean_out, acl_rstd_out);
+ void * buffer = temp_buffer_allocator.get();
+ aclTensor * acl_mean_out = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+ aclTensor * acl_rstd_out =
+ ggml_cann_create_tensor((char *) buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+
+ GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst, acl_mean_out,
+ acl_rstd_out);
ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
}
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0];
- ggml_tensor* src1 = dst->src[1];
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1];
- size_t nb1 = ((int32_t*)dst->op_params)[0];
- size_t nb2 = ((int32_t*)dst->op_params)[1];
- size_t nb3 = ((int32_t*)dst->op_params)[2];
- size_t offset = ((int32_t*)dst->op_params)[3];
- bool inplace = (bool)((int32_t*)dst->op_params)[4];
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
+ size_t offset = ((int32_t *) dst->op_params)[3];
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
- size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
+ size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
- aclTensor* acl_dst = ggml_cann_create_tensor(
- dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+ aclTensor * acl_src1 = ggml_cann_create_tensor(src1);
- aclScalar* alpha = nullptr;
- float alphaValue = 1.0f;
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ aclScalar * alpha = nullptr;
+ float alphaValue = 1.0f;
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
if (!inplace) {
size_t cpy_size = ggml_nbytes(dst);
- ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
- ACL_MEMCPY_DEVICE_TO_DEVICE);
- aclTensor* acl_src0 = ggml_cann_create_tensor(
- src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+ ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE);
+ aclTensor * acl_src0 = ggml_cann_create_tensor(src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
ggml_cann_release_resources(ctx, acl_src0);
* @param dim An array of dimension indices.
* @param dim_size The number of dimensions.
*/
-static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
- int64_t* dim, size_t dim_size) {
+static void aclnn_reduce_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t * dim, size_t dim_size) {
GGML_ASSERT(dst->ne[0] == 1);
- ggml_tensor* src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
- aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
+ ggml_tensor * src = dst->src[0];
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
+ aclIntArray * reduce_dims = aclCreateIntArray(dim, dim_size);
- GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
- ggml_cann_type_mapping(dst->type), acl_dst);
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true, ggml_cann_type_mapping(dst->type), acl_dst);
ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
}
-void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- int64_t reduce_dims[] = {3};
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ int64_t reduce_dims[] = { 3 };
aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
}
-void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- int64_t reduce_dims[] = {0, 1, 2, 3};
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ int64_t reduce_dims[] = { 0, 1, 2, 3 };
aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
}
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
- ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
- aclTensor* acl_src =
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
- aclTensor* acl_dst =
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ aclTensor * acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
- std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
- auto output_size_array = aclCreateIntArray(output_size.data(), 2);
+ std::vector<int64_t> output_size{ dst->ne[1], dst->ne[0] };
+ auto output_size_array = aclCreateIntArray(output_size.data(), 2);
GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
* The size of the array should be twice the number of dimensions of the tensor.
* @param value The value to be used for padding. The default value is 0.0.
*/
-static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst, int64_t* paddings,
- float value = 0.0f) {
- aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+static void aclnn_pad(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * paddings,
+ float value = 0.0f) {
+ aclIntArray * acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
+ aclScalar * acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
ggml_cann_release_resources(ctx, acl_pad, acl_value);
}
-void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
    // padding: each value in the array specifies how much padding to add,
    // and the position of each element indicates which direction to pad,
const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
- int64_t paddings[] = {lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3};
+ int64_t paddings[] = { lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 };
aclnn_pad(ctx, acl_src, acl_dst, paddings);
ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
* @param dst The destination tensor where the result will be stored. The source
* tensor is referenced by `dst->src[0]`.
*/
-static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
- ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+static void ggml_cann_avg_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
- aclTensor* acl_src =
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
- aclTensor* acl_dst =
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
-
- const int32_t* opts = (const int32_t*)dst->op_params;
- const int k0 = opts[1];
- const int k1 = opts[2];
- const int s0 = opts[3];
- const int s1 = opts[4];
- const int p0 = opts[5];
- const int p1 = opts[6];
-
- std::vector<int64_t> kernel_dims = {k1, k0};
- std::vector<int64_t> stride_dims = {s1, s0};
- std::vector<int64_t> padding_avg_dims = {p1, p0}; // (padH, padW)
-
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
- auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
-
- bool ceil_mode = false;
- bool count_include_pad = true;
- int64_t divisor_override = 0;
- int8_t cube_math_type = 0;
+ aclTensor * acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int s0 = opts[3];
+ const int s1 = opts[4];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
+
+ std::vector<int64_t> kernel_dims = { k1, k0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
+ std::vector<int64_t> padding_avg_dims = { p1, p0 }; // (padH, padW)
+
+ auto * kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+ auto * strides = aclCreateIntArray(stride_dims.data(), 2);
+ auto * paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
+
+ bool ceil_mode = false;
+ bool count_include_pad = true;
+ int64_t divisor_override = 0;
+ int8_t cube_math_type = 0;
#ifdef ASCEND_310P
cube_math_type = 1;
#endif
- GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
- ceil_mode, count_include_pad, divisor_override,
- cube_math_type, acl_dst);
- ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides,
- paddings_avg);
+ GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg, ceil_mode, count_include_pad,
+ divisor_override, cube_math_type, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides, paddings_avg);
}
/**
* @param dst The destination tensor where the result will be stored. The source
* tensor is referenced by `dst->src[0]`.
*/
-static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
- ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+static void ggml_cann_max_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
- aclTensor* acl_src =
- ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
- aclTensor* acl_dst =
- ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ aclTensor * acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
- const int32_t* opts = (const int32_t*)dst->op_params;
- const int k0 = opts[1];
- const int k1 = opts[2];
- const int s0 = opts[3];
- const int s1 = opts[4];
- const int p0 = opts[5];
- const int p1 = opts[6];
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int s0 = opts[3];
+ const int s1 = opts[4];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
- int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
- src->ne[3]};
- size_t temp_nb[GGML_MAX_DIMS];
+ int64_t temp_ne[] = { src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], src->ne[3] };
+ size_t temp_nb[GGML_MAX_DIMS];
temp_nb[0] = ggml_element_size(src);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
}
- ggml_cann_pool_alloc temp_buffer_allocator(
- ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
- void* buffer = temp_buffer_allocator.get();
- aclTensor* tmp_tensor = ggml_cann_create_tensor(
- buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
- GGML_MAX_DIMS, ACL_FORMAT_NCHW);
+ ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
+ void * buffer = temp_buffer_allocator.get();
+ aclTensor * tmp_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
+ GGML_MAX_DIMS, ACL_FORMAT_NCHW);
// pad: see padding in ggml_cann_pad()
- int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
- float value = -FLT_MAX;
+ int64_t paddings[] = { p0, p0, p1, p1, 0, 0, 0, 0 };
+ float value = -FLT_MAX;
aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
// max_pool
- std::vector<int64_t> kernel_dims = {k1, k0};
- std::vector<int64_t> stride_dims = {s1, s0};
+ std::vector<int64_t> kernel_dims = { k1, k0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
// padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
- std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
- std::vector<int64_t> dilation_size = {1, 1};
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
- auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
-
- bool ceil_mode = false;
+ std::vector<int64_t> padding_max_dims = { 0, 0, 0, 0 };
+ std::vector<int64_t> dilation_size = { 1, 1 };
+ auto * kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+ auto * strides = aclCreateIntArray(stride_dims.data(), 2);
+ auto * paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
+ auto * dilations = aclCreateIntArray(dilation_size.data(), 2);
+
+ bool ceil_mode = false;
int64_t auto_pads = 0;
- GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
- paddings_max, dilations, ceil_mode, acl_dst);
- ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size,
- strides, paddings_max, dilations);
+ GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
+ ceil_mode, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size, strides, paddings_max, dilations);
}
-void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- const int32_t* opts = (const int32_t*)dst->op_params;
- enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ const int32_t * opts = (const int32_t *) dst->op_params;
+ enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
switch (op) {
case GGML_OP_POOL_AVG:
ggml_cann_avg_pool2d(ctx, dst);
* @param acl_src The source tensor from which data will be copied.
* @param acl_dst The destination tensor where the data will be copied to.
*/
-static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst) {
+static void cann_copy(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
}
-void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0];
+void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
if (ggml_are_same_shape(src0, dst)) {
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
if (dst->type == src0->type) {
cann_copy(ctx, acl_src, acl_dst);
} else {
}
ggml_cann_release_resources(ctx, acl_src, acl_dst);
} else {
- void* src_trans_buffer = src0->data;
+ void * src_trans_buffer = src0->data;
ggml_cann_pool_alloc src_buffer_allocator;
if (!ggml_is_contiguous(src0)) {
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
- src_buffer_allocator.alloc(ctx.pool(),
- ggml_nelements(src0) * ggml_type_size(src0->type));
+ aclTensor * acl_src = ggml_cann_create_tensor(src0);
+ src_buffer_allocator.alloc(ctx.pool(), ggml_nelements(src0) * ggml_type_size(src0->type));
src_trans_buffer = src_buffer_allocator.get();
size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
}
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
- src_trans_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), src0->ne, src_trans_nb,
- GGML_MAX_DIMS);
+ aclTensor * src_trans_tensor =
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
cann_copy(ctx, acl_src, src_trans_tensor);
ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
}
src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
}
- aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer,
- ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type),
- dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * trans_acl_src =
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
if (dst->type == src0->type) {
cann_copy(ctx, trans_acl_src, acl_dst);
* @param type_size The size of each element in the tensor data type.
* @return An ACL tensor initialized with zeros.
*/
-static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
- size_t n_bytes, int64_t* ne, int64_t dims,
- aclDataType type, size_t type_size) {
+static aclTensor * aclnn_zero(ggml_backend_cann_context & ctx,
+ void * buffer,
+ size_t n_bytes,
+ int64_t * ne,
+ int64_t dims,
+ aclDataType type,
+ size_t type_size) {
size_t nb[GGML_MAX_DIMS];
nb[0] = type_size;
for (int i = 1; i < dims; i++) {
nb[i] = nb[i - 1] * ne[i - 1];
}
- aclTensor* zero =
- ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
+ aclTensor * zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
return zero;
GGML_UNUSED(n_bytes);
* is 1.0).
* @return An ACL tensor initialized with value.
*/
-static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
- size_t n_bytes, int64_t* ne, int64_t dims,
- aclDataType type, size_t type_size,
- float value = 1.0f) {
- aclTensor* acl_tensor =
- aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
- float alpha_host = 1.0f;
- aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
- aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+static aclTensor * aclnn_values(ggml_backend_cann_context & ctx,
+ void * buffer,
+ size_t n_bytes,
+ int64_t * ne,
+ int64_t dims,
+ aclDataType type,
+ size_t type_size,
+ float value = 1.0f) {
+ aclTensor * acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
+ float alpha_host = 1.0f;
+ aclScalar * alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
+ aclScalar * other = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor, other, alpha);
return acl_tensor;
}
* @param scalar The scalar value used to fill the tensor.
* @param acl_dst The destination tensor to be filled with the scalar value.
*/
-static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
- aclTensor* acl_dst) {
+static void aclnn_fill_scalar(ggml_backend_cann_context & ctx, float scalar, aclTensor * acl_dst) {
auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
ggml_cann_release_resources(ctx, acl_scalar);
* initialization via memset or arbitrary values via fill_scalar).
* @return An aclTensor pointer created from the cached buffer.
*/
-static aclTensor* get_cache_acl_tensor(
- ggml_backend_cann_context& ctx,
- void** buffer,
- int64_t &cache_element,
- int64_t* ne,
- size_t* nb,
- ggml_type dtype,
- int64_t dims,
- float value) {
+static aclTensor * get_cache_acl_tensor(ggml_backend_cann_context & ctx,
+ void ** buffer,
+ int64_t & cache_element,
+ int64_t * ne,
+ size_t * nb,
+ ggml_type dtype,
+ int64_t dims,
+ float value) {
// Calculate total number of elements
int64_t n_element = 1;
for (int i = 0; i < dims; i++) {
cache_element = n_element;
// Initialize cache
- int64_t pool_ne[1] = { n_element };
- size_t pool_nb[1] = { ggml_type_size(dtype) };
- aclTensor* acl_value = ggml_cann_create_tensor(
- *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype),
- pool_ne, pool_nb, 1);
+ int64_t pool_ne[1] = { n_element };
+ size_t pool_nb[1] = { ggml_type_size(dtype) };
+ aclTensor * acl_value =
+ ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), pool_ne, pool_nb, 1);
aclnn_fill_scalar(ctx, value, acl_value);
ggml_cann_release_resources(ctx, acl_value);
}
- return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype),
- ggml_type_size(dtype), ne, nb, dims);
+ return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), ne, nb, dims);
}
-void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
}
- aclTensor* acl_gamma = get_cache_acl_tensor(
- ctx,
- &ctx.rms_norm_one_tensor_cache.cache,
- ctx.rms_norm_one_tensor_cache.size,
- src->ne,
- acl_gamma_nb,
- dst->type,
- 1, // dims
- 1.0f // value
+ aclTensor * acl_gamma = get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache,
+ ctx.rms_norm_one_tensor_cache.size, src->ne, acl_gamma_nb, dst->type,
+ 1, // dims
+ 1.0f // value
);
// build rstd.
- int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
- size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
+ int64_t acl_rstd_ne[] = { src->ne[1], src->ne[2], src->ne[3] };
+ size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
// rstd will always be F32.
acl_rstd_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
}
- aclTensor* acl_rstd = get_cache_acl_tensor(
- ctx,
- &ctx.rms_norm_zero_tensor_cache.cache,
- ctx.rms_norm_zero_tensor_cache.size,
- acl_rstd_ne,
- acl_rstd_nb,
- GGML_TYPE_F32,
- GGML_MAX_DIMS - 1,
- 0.0f // value
- );
+ aclTensor * acl_rstd =
+ get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
+ acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS - 1,
+ 0.0f // value
+ );
GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
}
// TODO: performance is low.
-void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
- float value) {
- ggml_tensor* src = dst->src[0];
+void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value) {
+ ggml_tensor * src = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
- const int n_past = ((int32_t*)dst->op_params)[0];
+ const int n_past = ((int32_t *) dst->op_params)[0];
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
- void* buffer = one_tensor_allocator.get();
+ void * buffer = one_tensor_allocator.get();
- aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
- ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
+ aclTensor * mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
+ ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
aclnn_fill_scalar(ctx, value, mask_tensor);
- aclScalar* alpha = nullptr;
- float alphaValue = 1.0f;
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ aclScalar * alpha = nullptr;
+ float alphaValue = 1.0f;
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1);
GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst);
* tensor.
* @param dims The number of dimensions in the tensor.
*/
-static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) {
- aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
+static void aclnn_permute(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * new_dim,
+ uint64_t dims) {
+ aclIntArray * acl_dims = aclCreateIntArray(new_dim, dims);
GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims, acl_dst);
ggml_cann_release_resources(ctx, acl_dims);
}
-static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
- ggml_tensor* dst,
- ggml_tensor* src1,
- aclTensor* tmp_cast_tensor,
- aclTensor* tmp_im2col_tensor) {
+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ ggml_tensor * src1,
+ aclTensor * tmp_cast_tensor,
+ aclTensor * tmp_im2col_tensor) {
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
- int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
- size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
- aclTensor* acl_dst =
- ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+ int64_t dst_ne[] = { dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3] };
+ size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[3] };
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
- int64_t permute_dim[] = {0, 2, 1};
+ int64_t permute_dim[] = { 0, 2, 1 };
if (src1->type != dst->type) {
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
} else {
ggml_cann_release_resources(ctx, acl_dst);
}
-static void ggml_cann_im2col_1d_post_process(
- ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
- aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
- const std::vector<int64_t>& im2col_op_params) {
+static void ggml_cann_im2col_1d_post_process(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ ggml_tensor * src1,
+ aclTensor * tmp_cast_tensor,
+ aclTensor * tmp_im2col_tensor,
+ const std::vector<int64_t> & im2col_op_params) {
// get params
- const int64_t KH = im2col_op_params[0];
- const int64_t KW = im2col_op_params[1];
- const int64_t IW = im2col_op_params[2];
- const int64_t IC = im2col_op_params[3];
- const int64_t N = im2col_op_params[4];
- const int64_t OH = im2col_op_params[5];
- const int64_t OW = im2col_op_params[6];
- const int64_t s0 = im2col_op_params[7];
- const int64_t p0 = im2col_op_params[8];
- const int64_t d0 = im2col_op_params[9];
+ const int64_t KH = im2col_op_params[0];
+ const int64_t KW = im2col_op_params[1];
+ const int64_t IW = im2col_op_params[2];
+ const int64_t IC = im2col_op_params[3];
+ const int64_t N = im2col_op_params[4];
+ const int64_t OH = im2col_op_params[5];
+ const int64_t OW = im2col_op_params[6];
+ const int64_t s0 = im2col_op_params[7];
+ const int64_t p0 = im2col_op_params[8];
+ const int64_t d0 = im2col_op_params[9];
const int64_t n_bytes_factor = im2col_op_params[10];
// Permute: [N, IC * KH * KW, OW * OH] ->
// [N, OW * OH * n_bytes_factor, IC * KH * KW]
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
- void* tmp_permute_buffer = tmp_permute_allocator.get();
+ void * tmp_permute_buffer = tmp_permute_allocator.get();
- int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
- size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+ int64_t tmp_permute_ne[] = { IC * KH * KW, OW * OH * n_bytes_factor, N };
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
tmp_permute_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
}
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
- tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+ aclTensor * tmp_permute_tensor =
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
- int64_t permute_dim[] = {0, 2, 1};
+ int64_t permute_dim[] = { 0, 2, 1 };
if (src1->type != dst->type) {
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
} else {
- aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
- 3);
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, 3);
}
// number of times the kernel moves in W dimension
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
- size_t offset;
- void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+ size_t offset;
+ void * cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
    // memory copy with offset to restore 1D im2col from 2D
if (IC > 1) {
- offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
size_t size_cpy = KH * KW * ggml_type_size(dst->type);
for (int c = 0; c < IC; c++) {
- cur_permute_buffer = (char*)tmp_permute_buffer + offset +
- KH * KW * c * ggml_type_size(dst->type);
- cur_dst_buffer = (char*)dst->data +
- c * KH * KW * n_step_w * ggml_type_size(dst->type);
+ cur_permute_buffer = (char *) tmp_permute_buffer + offset + KH * KW * c * ggml_type_size(dst->type);
+ cur_dst_buffer = (char *) dst->data + c * KH * KW * n_step_w * ggml_type_size(dst->type);
for (int i = 0; i < n_step_w; i++) {
- ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy,
- ACL_MEMCPY_DEVICE_TO_DEVICE);
- cur_dst_buffer =
- (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
- cur_permute_buffer = (char*)cur_permute_buffer +
- KH * KW * IC * ggml_type_size(dst->type);
+ ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy, ACL_MEMCPY_DEVICE_TO_DEVICE);
+ cur_dst_buffer = (char *) cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+ cur_permute_buffer = (char *) cur_permute_buffer + KH * KW * IC * ggml_type_size(dst->type);
}
}
} else {
- offset = KH * KW * n_step_w *
- ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
- ggml_cann_async_memcpy(ctx, dst->data, (char*)tmp_permute_buffer + offset, offset,
- ACL_MEMCPY_DEVICE_TO_DEVICE);
+ offset = KH * KW * n_step_w * ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
+ ggml_cann_async_memcpy(ctx, dst->data, (char *) tmp_permute_buffer + offset, offset,
+ ACL_MEMCPY_DEVICE_TO_DEVICE);
}
ggml_cann_release_resources(ctx, tmp_permute_tensor);
}
-void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0]; // kernel
- ggml_tensor* src1 = dst->src[1]; // input
+void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // kernel
+ ggml_tensor * src1 = dst->src[1]; // input
GGML_TENSOR_BINARY_OP_LOCALS;
    // aclnnIm2col only works in 2D. Set s1, p1, d1 to 1 to perform a 2D
    // im2col, then post-process to restore the 1D result.
- const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
- const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
- const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
- const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
- const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
- const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
- const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
-
- const int64_t N = ne13;
+ const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1;
+ const int32_t s0 = ((const int32_t *) (dst->op_params))[0];
+ const int32_t s1 = is_2D ? ((const int32_t *) (dst->op_params))[1] : 1;
+ const int32_t p0 = ((const int32_t *) (dst->op_params))[2];
+ const int32_t p1 = is_2D ? ((const int32_t *) (dst->op_params))[3] : 1;
+ const int32_t d0 = ((const int32_t *) (dst->op_params))[4];
+ const int32_t d1 = is_2D ? ((const int32_t *) (dst->op_params))[5] : 1;
+
+ const int64_t N = ne13;
const int64_t IC = ne12;
const int64_t KH = ne01;
const int64_t KW = ne00;
const int64_t n_bytes_factor = is_2D ? 1 : 3;
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
- aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
- int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
- size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
+ aclTensor * acl_src1 = ggml_cann_create_tensor(src1);
+ int64_t tmp_im2col_ne[] = { OW * OH * n_bytes_factor, IC * KH * KW, N };
+ size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
tmp_im2col_nb[0] = ggml_type_size(src1->type);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
// Calculate im2col.
    // If dst is f16, tmp_buffer is f32, so we need to allocate
    // src type size * dst element count bytes.
- ggml_cann_pool_alloc im2col_allocator(
- ctx.pool(),
- ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
- void* tmp_im2col_buffer = im2col_allocator.get();
-
- aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
- tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
- ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
-
- std::vector<int64_t> kernel_dims = {KH, KW};
- std::vector<int64_t> dilation_size = {d1, d0};
- std::vector<int64_t> padding_dims = {p1, p0};
- std::vector<int64_t> stride_dims = {s1, s0};
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
- auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
- GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
- paddings, strides, tmp_im2col_tensor);
+ ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
+ void * tmp_im2col_buffer = im2col_allocator.get();
+
+ aclTensor * tmp_im2col_tensor =
+ ggml_cann_create_tensor(tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
+ tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+ std::vector<int64_t> kernel_dims = { KH, KW };
+ std::vector<int64_t> dilation_size = { d1, d0 };
+ std::vector<int64_t> padding_dims = { p1, p0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
+ auto * kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+ auto * dilations = aclCreateIntArray(dilation_size.data(), 2);
+ auto * paddings = aclCreateIntArray(padding_dims.data(), 2);
+ auto * strides = aclCreateIntArray(stride_dims.data(), 2);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations, paddings, strides, tmp_im2col_tensor);
// Cast if dst is f16.
- aclTensor* tmp_cast_tensor = nullptr;
+ aclTensor * tmp_cast_tensor = nullptr;
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
- void* tmp_cast_buffer = nullptr;
+ void * tmp_cast_buffer = nullptr;
if (src1->type != dst->type) {
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
tmp_cast_buffer = tmp_cast_allocator.get();
temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
}
- tmp_cast_tensor = ggml_cann_create_tensor(
- tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+ tmp_cast_tensor =
+ ggml_cann_create_tensor(tmp_cast_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping(dst->type));
}
// post-processing
if (is_2D) {
- ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
- tmp_im2col_tensor);
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor, tmp_im2col_tensor);
} else {
- std::vector<int64_t> im2col_op_params = {
- KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
- ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
- tmp_im2col_tensor, im2col_op_params);
+ std::vector<int64_t> im2col_op_params = { KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor };
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor, tmp_im2col_tensor, im2col_op_params);
}
- ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
- kernel_size, dilations, paddings, strides);
+ ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor, kernel_size, dilations, paddings,
+ strides);
}
/**
* @param ctx The context for the CANN backend operations.
* @param acl_src The tensor on which the exponential function will be applied.
*/
-static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
+static void aclnn_exp(ggml_backend_cann_context & ctx, aclTensor * acl_src) {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
}
-void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst) {
- if(acl_dst == nullptr) {
+void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ if (acl_dst == nullptr) {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src);
} else {
GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
}
}
-void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst) {
- if(acl_dst == nullptr) {
+void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ if (acl_dst == nullptr) {
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src);
} else {
GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
}
}
-void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
- ggml_tensor* dst) {
- const ggml_tensor* src = dst->src[0];
+void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
- const int dim = dst->op_params[0];
+ const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
- int half = dim / 2;
+ int half = dim / 2;
- aclTensor* acl_src = ggml_cann_create_tensor(src);
+ aclTensor * acl_src = ggml_cann_create_tensor(src);
// arange: [0, ..., half)
- float start = 0;
- float stop = half;
- float step = 1;
+ float start = 0;
+ float stop = half;
+ float step = 1;
int64_t n_elements_arange = half;
- int64_t tmp_arange_ne[] = {half};
- size_t tmp_arange_nb[] = {sizeof(dst->type)};
+ int64_t tmp_arange_ne[] = { half };
+ size_t tmp_arange_nb[] = { sizeof(dst->type) };
ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
- void* tmp_arange_buffer = arange_allocator.get();
- aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
- tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb,
- GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
+ void * tmp_arange_buffer = arange_allocator.get();
+ aclTensor * tmp_arange_tensor =
+ ggml_cann_create_tensor(tmp_arange_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
// freq
float freq_param = -logf(max_period) / half;
- bool inplace = true;
+ bool inplace = true;
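+    // freqs[i] = exp(i * freq_param) = max_period^(-i / half), the standard sinusoidal embedding frequencies.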
aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
aclnn_exp(ctx, tmp_arange_tensor);
// permute: src [0,1,2,3]->[0,1,3,2]
- int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
- size_t tmp_permute_nb[GGML_MAX_DIMS];
+ int64_t tmp_permute_ne[] = { src->ne[1], src->ne[0], src->ne[2], src->ne[3] };
+ size_t tmp_permute_nb[GGML_MAX_DIMS];
tmp_permute_nb[0] = ggml_type_size(src->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
}
ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
- void* tmp_permute_buffer = permute_allocator.get();
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
- tmp_permute_buffer, ggml_cann_type_mapping(src->type),
- ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb,
- GGML_MAX_DIMS, ACL_FORMAT_ND);
- int64_t permute_dim[] = {0, 1, 3, 2};
- int64_t num_dims = 4;
+ void * tmp_permute_buffer = permute_allocator.get();
+ aclTensor * tmp_permute_tensor =
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
+ int64_t permute_dim[] = { 0, 1, 3, 2 };
+ int64_t num_dims = 4;
aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
// timestep * freq
- int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
- src->ne[3]};
- size_t tmp_mul_nb[GGML_MAX_DIMS];
+ int64_t tmp_mul_ne[] = { src->ne[1] * half, src->ne[0], src->ne[2], src->ne[3] };
+ size_t tmp_mul_nb[GGML_MAX_DIMS];
tmp_mul_nb[0] = ggml_type_size(src->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
}
- int mul_nelements =
- src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
+ int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
- ggml_cann_pool_alloc mul_allocator(
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
- void* tmp_mul_buffer = mul_allocator.get();
- aclTensor* tmp_mul_tensor = ggml_cann_create_tensor(
- tmp_mul_buffer, ggml_cann_type_mapping(src->type),
- ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
- ACL_FORMAT_ND);
+ ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
+ void * tmp_mul_buffer = mul_allocator.get();
+ aclTensor * tmp_mul_tensor =
+ ggml_cann_create_tensor(tmp_mul_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
// cos
- ggml_cann_pool_alloc cos_allocator(
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
- void* tmp_cos_buffer = cos_allocator.get();
- aclTensor* tmp_cos_tensor = ggml_cann_create_tensor(
- tmp_cos_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
- ACL_FORMAT_ND);
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
+ void * tmp_cos_buffer = cos_allocator.get();
+ aclTensor * tmp_cos_tensor =
+ ggml_cann_create_tensor(tmp_cos_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
// sin
- ggml_cann_pool_alloc sin_allocator(
- ctx.pool(), mul_nelements * ggml_type_size(src->type));
- void* tmp_sin_buffer = sin_allocator.get();
- aclTensor* tmp_sin_tensor = ggml_cann_create_tensor(
- tmp_sin_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
- ACL_FORMAT_ND);
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
+ void * tmp_sin_buffer = sin_allocator.get();
+ aclTensor * tmp_sin_tensor =
+ ggml_cann_create_tensor(tmp_sin_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
// concat
- int64_t concat_dim = 3;
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
- aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor};
- aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
+ int64_t concat_dim = 3;
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * tensors[] = { tmp_cos_tensor, tmp_sin_tensor };
+ aclTensorList * tensor_list = aclCreateTensorList(tensors, 2);
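+    // concatenate along the feature dim so that dst = [cos(args), sin(args)], the ggml timestep embedding layout.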
aclnn_concat(ctx, tensor_list, acl_dst, concat_dim);
// release
    // releasing both the tensorList and its elements causes a segmentation fault, so only the list is released.
- ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor,
- tmp_permute_tensor, tmp_mul_tensor, acl_dst);
+ ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor, tmp_permute_tensor, tmp_mul_tensor,
+ acl_dst);
}
/**
* @param acl_exp The exponent tensor, each element of which is used to raise
* the corresponding element in the destination tensor.
*/
-static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
- aclTensor* acl_dst, aclTensor* acl_exp) {
+static void aclnn_pow_tensor_tensor(ggml_backend_cann_context & ctx, aclTensor * acl_dst, aclTensor * acl_exp) {
GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
}
* @param step Step size for the exponent increment.
* @param dtype Data type for slope tensor.
*/
-static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
- float m, int64_t size, float start, float stop, float step, ggml_type dtype){
- aclDataType acl_type = ggml_cann_type_mapping(dtype);
- size_t type_size = ggml_type_size(dtype);
-
- int64_t ne[] = {size};
- size_t nb[] = {type_size};
+static void aclnn_get_slope_inner(ggml_backend_cann_context & ctx,
+ void * slope_buffer,
+ float m,
+ int64_t size,
+ float start,
+ float stop,
+ float step,
+ ggml_type dtype) {
+ aclDataType acl_type = ggml_cann_type_mapping(dtype);
+ size_t type_size = ggml_type_size(dtype);
+
+ int64_t ne[] = { size };
+ size_t nb[] = { type_size };
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
- void* arange_buffer = arange_allocator.get();
+ void * arange_buffer = arange_allocator.get();
- aclTensor* arange_tensor = ggml_cann_create_tensor(
- arange_buffer, acl_type, type_size, ne, nb, 1);
+ aclTensor * arange_tensor = ggml_cann_create_tensor(arange_buffer, acl_type, type_size, ne, nb, 1);
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
- aclTensor* slope_tensor = ggml_cann_create_tensor(
- slope_buffer, acl_type, type_size, ne, nb, 1);
+ aclTensor * slope_tensor = ggml_cann_create_tensor(slope_buffer, acl_type, type_size, ne, nb, 1);
- aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
+ aclScalar * sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
* @param dtype Data type for slope tensor.
*
*/
-static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
- void* slope_buffer, float max_bias, ggml_type dtype) {
+static void aclnn_get_slope(ggml_backend_cann_context & ctx,
+ int64_t n_head,
+ void * slope_buffer,
+ float max_bias,
+ ggml_type dtype) {
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
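+    // n_head_log2 is the largest power of two not exceeding n_head; it splits the heads into two groups
+    // that use different ALiBi slope bases.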
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
end = 2 * ((n_head - 1) - n_head_log2) + 1;
step = 2;
count = n_head - n_head_log2;
- aclnn_get_slope_inner(
- ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
- m1, count, start, end + 1, step, dtype);
+ aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
+ dtype);
}
}
* - Write data into dst_ptr using only the shape information of the dst tensor.
* - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
*/
-static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
- ggml_tensor* dst, void* dst_ptr, float max_bias) {
- void* slope_buffer = nullptr;
- void* bias_buffer = nullptr;
+static void aclnn_add_alibi(ggml_backend_cann_context & ctx,
+ ggml_tensor * mask,
+ ggml_tensor * dst,
+ void * dst_ptr,
+ float max_bias) {
+ void * slope_buffer = nullptr;
+ void * bias_buffer = nullptr;
if (max_bias > 0.0f) {
- int64_t n_heads = dst->ne[2];
+ int64_t n_heads = dst->ne[2];
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
slope_buffer = slope_allocator.get();
- ggml_cann_pool_alloc bias_allocator(
- ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
+ ggml_cann_pool_alloc bias_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
bias_buffer = bias_allocator.get();
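+        // one ALiBi slope per head; bias_buffer will hold slope * mask before it is added to dst.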
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
}
// broadcast the mask across rows
int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
- size_t mask_nb[] = {
- mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
- mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
- };
+    size_t mask_nb[] = { mask->nb[0], mask->nb[1], mask->nb[2], mask->nb[2], mask->nb[3], mask->nb[3] };
int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
- size_t dst_nb[] = {
- dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
- dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
- };
+    size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[2], dst->nb[3], dst->nb[3] };
// slope is a 1 dim tensor, slope.ne2 == dst.ne2
int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
}
- aclTensor* acl_slope = ggml_cann_create_tensor(
- slope_buffer, ACL_FLOAT, sizeof(float),
- slope_ne, slope_nb, GGML_MAX_DIMS + 2);
- aclTensor* acl_mask = ggml_cann_create_tensor(
- mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
+ aclTensor * acl_slope =
+ ggml_cann_create_tensor(slope_buffer, ACL_FLOAT, sizeof(float), slope_ne, slope_nb, GGML_MAX_DIMS + 2);
+ aclTensor * acl_mask = ggml_cann_create_tensor(mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
// write data into dst_ptr using only the shape information of the dst tensor.
- aclTensor* acl_dst = ggml_cann_create_tensor(
- dst_ptr, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), dst_ne, dst_nb,
- GGML_MAX_DIMS + 2);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst_ptr, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ dst_ne, dst_nb, GGML_MAX_DIMS + 2);
if (max_bias > 0.0f) {
int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
}
- aclTensor* bias_tensor = ggml_cann_create_tensor(
- bias_buffer, ACL_FLOAT, sizeof(float),
- bias_ne, bias_nb, GGML_MAX_DIMS + 2);
+ aclTensor * bias_tensor =
+ ggml_cann_create_tensor(bias_buffer, ACL_FLOAT, sizeof(float), bias_ne, bias_nb, GGML_MAX_DIMS + 2);
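+        // bias = slope * mask, then dst += bias (the ALiBi position bias).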
aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
aclnn_add(ctx, acl_dst, bias_tensor);
* @param acl_dst The destination tensor where the softmax results will be
* stored.
*/
-static void aclnn_softmax(ggml_backend_cann_context & ctx,
- aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
+static void aclnn_softmax(ggml_backend_cann_context & ctx, aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
}
void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
- ggml_tensor* src0 = dst->src[0];
- ggml_tensor* src1 = dst->src[1]; // mask
+ ggml_tensor * src0 = dst->src[0];
+ ggml_tensor * src1 = dst->src[1]; // mask
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src0 = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// input mul scale
- aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+ aclScalar * acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
- void* src_tensor_buffer = src_tensor_allocator.get();
- aclTensor* softmax_tensor = ggml_cann_create_tensor(
- src_tensor_buffer, ggml_cann_type_mapping(src0->type),
- ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);
+ void * src_tensor_buffer = src_tensor_allocator.get();
+ aclTensor * softmax_tensor = ggml_cann_create_tensor(src_tensor_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);
aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
* @param index The index tensor specifying the indices to select from the source tensor.
* @param type The data type of the source and destination tensors.
*/
-static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
- void* src_buffer,int64_t* src_ne, size_t* src_nb,
- void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
- ggml_tensor* index, ggml_type type) {
+static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
+ void * src_buffer,
+ int64_t * src_ne,
+ size_t * src_nb,
+ void * dst_buffer,
+ int64_t * dst_ne,
+ size_t * dst_nb,
+ ggml_tensor * index,
+ ggml_type type) {
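+    // aclnnIndexSelect gathers along a single dim of a 2-D view, so iterate the two outer dims explicitly
+    // and broadcast the index across them via the modulo arithmetic below.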
for (int64_t i = 0; i < src_ne[3]; i++) {
for (int64_t j = 0; j < src_ne[2]; j++) {
// src
- aclTensor* acl_src_tensor = ggml_cann_create_tensor(
- (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
- ggml_cann_type_mapping(type), ggml_type_size(type),
- src_ne, src_nb, 2);
+ aclTensor * acl_src_tensor =
+ ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
// index
- aclTensor* acl_index = ggml_cann_create_tensor(
- (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
- ggml_cann_type_mapping(index->type), ggml_element_size(index),
- index->ne, index->nb, 1);
+ aclTensor * acl_index = ggml_cann_create_tensor(
+ (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
+ ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
// out
- aclTensor* acl_out = ggml_cann_create_tensor(
- (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
- ggml_cann_type_mapping(type), ggml_type_size(type),
- dst_ne, dst_nb, 2);
+ aclTensor * acl_out =
+ ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
}
* @param index The index tensor specifying target positions in the destination tensor.
* @param type The data type of the source and destination tensors.
*/
-static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
- void* src_buffer,int64_t* src_ne, size_t* src_nb,
- void* dst_buffer, int64_t* dst_ne, size_t* dst_nb,
- ggml_tensor* index, ggml_type type) {
+static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
+ void * src_buffer,
+ int64_t * src_ne,
+ size_t * src_nb,
+ void * dst_buffer,
+ int64_t * dst_ne,
+ size_t * dst_nb,
+ ggml_tensor * index,
+ ggml_type type) {
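+    // inverse of the gather above: scatter each 2-D slice of src into the dst rows selected by the index.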
for (int64_t i = 0; i < src_ne[3]; i++) {
for (int64_t j = 0; j < src_ne[2]; j++) {
// src
- aclTensor* acl_src_tensor = ggml_cann_create_tensor(
- (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
- ggml_cann_type_mapping(type), ggml_type_size(type),
- src_ne, src_nb, 2);
+ aclTensor * acl_src_tensor =
+ ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
// index
- aclTensor* acl_index = ggml_cann_create_tensor(
- (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
- ggml_cann_type_mapping(index->type), ggml_element_size(index),
- index->ne, index->nb, 1);
+ aclTensor * acl_index = ggml_cann_create_tensor(
+ (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
+ ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
// out
- aclTensor* acl_out = ggml_cann_create_tensor(
- (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
- ggml_cann_type_mapping(type), ggml_type_size(type),
- dst_ne, dst_nb, 2);
+ aclTensor * acl_out =
+ ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
+ ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
}
}
}
-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0]; // src
- ggml_tensor* src1 = dst->src[1]; // index
+void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // src
+ ggml_tensor * src1 = dst->src[1]; // index
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
switch (src0->type) {
case GGML_TYPE_F16:
case GGML_TYPE_F32:
- if(src0->type == dst->type) {
- aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
+ if (src0->type == dst->type) {
+ aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
+ dst->type);
} else {
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
- ggml_cann_pool_alloc src_buffer_allocator(
- ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
- void* src_trans_buffer = src_buffer_allocator.get();
- size_t src_trans_nb[GGML_MAX_DIMS];
+ aclTensor * acl_src0 = ggml_cann_create_tensor(src0);
+ ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
+ void * src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = dst->nb[0];
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
}
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
- src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
- src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ aclTensor * src_trans_tensor =
+ ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
- aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
+ aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
+ dst->type);
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
}
break;
- case GGML_TYPE_Q8_0: {
- // add 1 dim for bcast mul.
- size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
- dequant_nb[GGML_MAX_DIMS + 1];
- int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
- *dequant_ne;
- int64_t scale_offset = 0;
- // [3,4,5,64] -> [3,4,5,2,32]
- weight_ne[0] = QK8_0;
- weight_ne[1] = src0->ne[0] / QK8_0;
- weight_nb[0] = sizeof(int8_t);
- weight_nb[1] = weight_nb[0] * weight_ne[0];
- for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
- weight_ne[i] = src0->ne[i - 1];
- weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
- }
- // [3,4,5,64] -> [3,4,5,2,1]
- scale_ne[0] = 1;
- scale_ne[1] = src0->ne[0] / QK8_0;
- scale_nb[0] = sizeof(uint16_t);
- scale_nb[1] = scale_nb[0] * scale_ne[0];
- for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
- scale_ne[i] = src0->ne[i - 1];
- scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
- }
- // [3,4,5,64] -> [3,4,5,2,32]
- dequant_ne = weight_ne;
- dequant_nb[0] = ggml_type_size(dst->type);
- for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
- dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
- }
- scale_offset = ggml_nelements(src0) * sizeof(int8_t);
- ggml_cann_pool_alloc dequant_buffer_allocator(
- ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
- aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
- src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
- GGML_MAX_DIMS + 1);
- aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
- src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
- GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
- aclTensor* dequant_tensor = ggml_cann_create_tensor(
- dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
- dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
- aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
- dequant_nb[0] = ggml_type_size(dst->type);
- dequant_ne = src0->ne;
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
- dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
- }
- aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
- dequant_ne, dequant_nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
+ case GGML_TYPE_Q8_0:
+ {
+ // add 1 dim for bcast mul.
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
+ int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
+ int64_t scale_offset = 0;
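+                // Q8_0: every block of QK8_0 int8 weights shares one f16 scale; the scales are stored
+                // right after the int8 data, so a broadcast multiply dequantizes the whole tensor.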
+ // [3,4,5,64] -> [3,4,5,2,32]
+ weight_ne[0] = QK8_0;
+ weight_ne[1] = src0->ne[0] / QK8_0;
+ weight_nb[0] = sizeof(int8_t);
+ weight_nb[1] = weight_nb[0] * weight_ne[0];
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+ weight_ne[i] = src0->ne[i - 1];
+ weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+ }
+ // [3,4,5,64] -> [3,4,5,2,1]
+ scale_ne[0] = 1;
+ scale_ne[1] = src0->ne[0] / QK8_0;
+ scale_nb[0] = sizeof(uint16_t);
+ scale_nb[1] = scale_nb[0] * scale_ne[0];
+ for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+ scale_ne[i] = src0->ne[i - 1];
+ scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+ }
+ // [3,4,5,64] -> [3,4,5,2,32]
+ dequant_ne = weight_ne;
+ dequant_nb[0] = ggml_type_size(dst->type);
+ for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+ dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+ }
+ scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+ ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
+ ggml_nelements(src0) * ggml_type_size(dst->type));
+ aclTensor * acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne,
+ weight_nb, GGML_MAX_DIMS + 1);
+ aclTensor * acl_scale_tensor =
+ ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
+ GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+ aclTensor * dequant_tensor =
+ ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
+ ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+ aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
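+                // the dequantized weights are now in dst->type; restore src0's original shape/strides for the gather.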
+ dequant_nb[0] = ggml_type_size(dst->type);
+ dequant_ne = src0->ne;
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+ }
+ aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
+ dst->nb, src1, dst->type);
- ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
- break;
- }
+ ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
+ break;
+ }
default:
GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
break;
}
}
-void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0]; // src
- ggml_tensor* src1 = dst->src[1]; // index
+void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // src
+ ggml_tensor * src1 = dst->src[1]; // index
switch (dst->type) {
- case GGML_TYPE_F32: {
- aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
- break;
- }
- case GGML_TYPE_F16: {
- aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
- ggml_cann_pool_alloc src_buffer_allocator(
- ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
- void* src_trans_buffer = src_buffer_allocator.get();
- size_t src_trans_nb[GGML_MAX_DIMS];
- src_trans_nb[0] = sizeof(uint16_t);
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ case GGML_TYPE_F32:
+ {
+ aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
+ break;
+ }
+ case GGML_TYPE_F16:
+ {
+ aclTensor * acl_src0 = ggml_cann_create_tensor(src0);
+ ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
+ void * src_trans_buffer = src_buffer_allocator.get();
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = sizeof(uint16_t);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
+ aclTensor * src_trans_tensor = ggml_cann_create_tensor(
+ src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
+ aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
+ dst->type);
+ ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
+ break;
}
- aclTensor* src_trans_tensor = ggml_cann_create_tensor(
- src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
- src0->ne, src_trans_nb, GGML_MAX_DIMS);
- aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
- aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
- dst->data, dst->ne, dst->nb,
- src1, dst->type);
- ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
- break;
- }
default:
GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
break;
* @param repeats The number of times each element will be repeated.
* @param output_size The size of the output tensor.
*/
-static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
- aclTensor* acl_src, aclTensor* acl_dst,
- int64_t dim, int64_t repeats,
- int64_t output_size) {
- GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim,
- output_size, acl_dst);
+static void aclnn_repeat_interleave(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t dim,
+ int64_t repeats,
+ int64_t output_size) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, output_size, acl_dst);
}
/**
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
-static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
- ggml_tensor* dst) {
- ggml_tensor* weight = dst->src[0]; // weight
- ggml_tensor* input = dst->src[1]; // input
+static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * weight = dst->src[0]; // weight
+ ggml_tensor * input = dst->src[1]; // input
    // When weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize broadcasts automatically;
    // when they are not 1, the weight has to be repeated explicitly.
}
}
- aclTensor* acl_input_tensor =
- ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
- int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
- bcast_weight_ne[2], bcast_weight_ne[3],
- bcast_weight_ne[4], bcast_weight_ne[5]};
- size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
- bcast_weight_nb[2], bcast_weight_nb[3],
- bcast_weight_nb[4], bcast_weight_nb[5]};
- aclTensor* acl_weight_tensor;
+ aclTensor * acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
+ int64_t transpose_ne[] = { bcast_weight_ne[1], bcast_weight_ne[0], bcast_weight_ne[2],
+ bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5] };
+ size_t transpose_nb[] = { bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2],
+ bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5] };
+ aclTensor * acl_weight_tensor;
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
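+    // FRACTAL_NZ is the NPU-native blocked weight layout; use it when enabled and the weight qualifies,
+    // otherwise fall back to the plain ND layout.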
if (weight_to_nz && is_matmul_weight(weight)) {
- acl_weight_tensor =
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
+ acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
} else {
- acl_weight_tensor =
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
+ acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
}
- aclTensor* acl_dst =
- ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
switch (n_dims) {
case 2:
* @param dst The destination tensor where the result of the matrix
* multiplication will be stored.
*/
-static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
- ggml_tensor* dst,
- const enum ggml_type type) {
- ggml_tensor* src0 = dst->src[0]; // weight
- ggml_tensor* src1 = dst->src[1]; // input
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst, const enum ggml_type type) {
+ ggml_tensor * src0 = dst->src[0]; // weight
+ ggml_tensor * src1 = dst->src[1]; // input
// The shape of the weight is NCHW.
// Matrix multiplication uses HW dims.
} else {
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
}
- float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
+ float weight_nb[] = { src0->ne[0] * weight_elem_size, weight_elem_size };
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
- size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
+ size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
    // Scales are stored at the end of the weight data and also need to be transposed.
size_t scale_elem_size = sizeof(uint16_t);
- size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
- scale_elem_size};
- size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
- char* scale_offset = (char*)src0->data + weight_size;
+ size_t scale_nb[] = { src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size };
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+ char * scale_offset = (char *) src0->data + weight_size;
// input
- size_t input_elem_size = sizeof(uint16_t);
- int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
- size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
- size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
+ size_t input_elem_size = sizeof(uint16_t);
+ int64_t input_ne[] = { src1->ne[0], src1->ne[1] };
+ size_t input_nb[] = { input_elem_size, input_ne[0] * input_elem_size };
+ size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
ggml_cann_pool_alloc input_alloctor(ctx.pool());
- void* input_buffer = src1->data;
+ void * input_buffer = src1->data;
    // cast the input to f16 if needed
if (src1->type != GGML_TYPE_F16) {
- aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
- input_buffer =
- input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
+ aclTensor * acl_src1_tensor = ggml_cann_create_tensor(src1);
+ input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
- int64_t* input_cast_ne = src1->ne;
- size_t input_cast_nb[GGML_MAX_DIMS];
+ int64_t * input_cast_ne = src1->ne;
+ size_t input_cast_nb[GGML_MAX_DIMS];
input_cast_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
}
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
- input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
- input_cast_nb, GGML_MAX_DIMS);
+ aclTensor * acl_input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, input_elem_size,
+ input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor);
}
// output
- size_t output_elem_size = sizeof(uint16_t);
- size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
+ size_t output_elem_size = sizeof(uint16_t);
+ size_t output_nb[] = { output_elem_size, dst->ne[0] * output_elem_size };
ggml_cann_pool_alloc output_allocator(ctx.pool());
- void* output_buffer =
- output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
- size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
+ void * output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
+ size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
// aclnn
- int64_t max_elem_size = 65535;
- int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
+ int64_t max_elem_size = 65535;
+ int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
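+    // split src0->ne[1] into chunks of at most max_elem_size columns and run the quantized matmul once per chunk.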
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
int64_t batch1 = (n1 * src1->ne[2]) + c1;
int64_t batch0 = (n0 * src0->ne[2]) + c0;
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
- (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
- input_elem_size, input_ne, input_nb, 2);
+ aclTensor * acl_input_tensor = ggml_cann_create_tensor((char *) input_buffer + batch1 * input_stride,
+ ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2);
// first split
int64_t weight_ne_offset = 0;
- int64_t weight_ne[2] = {
- max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
- src0->ne[0]};
- int64_t scale_ne_offset = 0;
- int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
+ int64_t weight_ne[2] = { max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0] };
+ int64_t scale_ne_offset = 0;
+ int64_t scale_ne[2] = { weight_ne[0], weight_ne[1] / QK8_0 };
int64_t output_ne_offset = 0;
- int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
-
- aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
- (char*)src0->data + batch0 * weight_stride,
- ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
- weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
- aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
- scale_offset + batch0 * scale_stride, ACL_FLOAT16,
- scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
- scale_ne_offset);
- aclTensor* acl_output_tensor = ggml_cann_create_tensor(
- (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
- output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
- output_ne_offset);
+ int64_t output_ne[2] = { weight_ne[0], dst->ne[1] };
+
+ aclTensor * acl_weight_tensor =
+ ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
+ weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+ aclTensor * acl_scale_tensor =
+ ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne,
+ scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
+ aclTensor * acl_output_tensor =
+ ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size,
+ output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
int64_t antiquantGroupSize = 0;
if (src0->ne[0] > QK8_0) {
antiquantGroupSize = QK8_0;
}
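+            // dequantize per group of QK8_0 values whenever a row spans more than one quantization block.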
- GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
- acl_weight_tensor, acl_scale_tensor, nullptr,
- nullptr, nullptr, nullptr, antiquantGroupSize,
- acl_output_tensor);
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, acl_weight_tensor,
+ acl_scale_tensor, nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
+ acl_output_tensor);
ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
// other splits
for (int64_t split = 1; split < split_size; split++) {
- weight_ne_offset +=
- weight_elem_size * weight_ne[0] * weight_ne[1];
- weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
- ? src0->ne[1] - (max_elem_size * split)
- : max_elem_size;
+ weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
+ weight_ne[0] =
+ max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
scale_ne[0] = weight_ne[0];
- output_ne_offset +=
- output_elem_size * output_ne[0] * output_ne[1];
+ output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
output_ne[0] = weight_ne[0];
- acl_weight_tensor = ggml_cann_create_tensor(
- (char*)src0->data + batch0 * weight_stride,
- ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
- weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
- acl_scale_tensor = ggml_cann_create_tensor(
- scale_offset + batch0 * scale_stride, ACL_FLOAT16,
- scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
- scale_ne_offset);
- acl_output_tensor = ggml_cann_create_tensor(
- (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
- output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
- output_ne_offset);
- GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
- acl_weight_tensor, acl_scale_tensor, nullptr,
- nullptr, nullptr, nullptr, antiquantGroupSize,
- acl_output_tensor);
+ acl_weight_tensor =
+ ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
+ weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
+ acl_scale_tensor =
+ ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size,
+ scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
+ acl_output_tensor =
+ ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16,
+ output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
+ GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, acl_weight_tensor,
+ acl_scale_tensor, nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
+ acl_output_tensor);
ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
}
    // cast the f16 output back to dst->type if needed
if (dst->type != GGML_TYPE_F16) {
- int64_t* output_cast_ne = dst->ne;
- size_t output_cast_nb[GGML_MAX_DIMS];
+ int64_t * output_cast_ne = dst->ne;
+ size_t output_cast_nb[GGML_MAX_DIMS];
output_cast_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
}
- aclTensor* acl_output_tensor = ggml_cann_create_tensor(
- output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
- output_cast_nb, GGML_MAX_DIMS);
- aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+ aclTensor * acl_output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
+ output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
+ aclTensor * acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor);
}
}
-void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
const enum ggml_type type = dst->src[0]->type;
switch (type) {
case GGML_TYPE_F32:
* @param dims An array specifying the dimensions along which elements are
* shifted.
*/
-static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
- aclTensor* acl_dst, int64_t* shifts, int64_t* dims) {
- aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1);
- aclIntArray* acl_dims = aclCreateIntArray(dims, 1);
+static void aclnn_roll(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ aclTensor * acl_dst,
+ int64_t * shifts,
+ int64_t * dims) {
+ aclIntArray * acl_shifts = aclCreateIntArray(shifts, 1);
+ aclIntArray * acl_dims = aclCreateIntArray(dims, 1);
GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts, acl_dims, acl_dst);
ggml_cann_release_resources(ctx, acl_shifts, acl_dims);
}
* @param index_num The number of positions specified in the index array.
* @param value The scalar value used to fill the specified positions.
*/
-static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
- aclTensor* acl_src, int64_t dim,
- int64_t* index, int64_t index_num,
- float value) {
- aclIntArray* acl_index = aclCreateIntArray(index, index_num);
- aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
+ aclTensor * acl_src,
+ int64_t dim,
+ int64_t * index,
+ int64_t index_num,
+ float value) {
+ aclIntArray * acl_index = aclCreateIntArray(index, index_num);
+ aclScalar * acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value);
ggml_cann_release_resources(ctx, acl_index, acl_value);
}
* @param is_neox Whether to use Neox-style repeat strategy
* (dim expansion vs repeat_interleave).
*/
-static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
- float* corr_dims, float ext_factor,
- float theta_scale, float freq_scale,
- float attn_factor, bool is_neox) {
- ggml_tensor* src0 = dst->src[0]; // input
- ggml_tensor* src1 = dst->src[1]; // position
- ggml_tensor* src2 = dst->src[2]; // freq_factors
-
- if(src2 == nullptr && ctx.rope_cache.cached
- && ctx.rope_cache.ext_factor == ext_factor
- && ctx.rope_cache.theta_scale == theta_scale
- && ctx.rope_cache.freq_scale == freq_scale
- && ctx.rope_cache.attn_factor == attn_factor
- && ctx.rope_cache.is_neox == is_neox) {
+static void aclnn_cache_init(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ float * corr_dims,
+ float ext_factor,
+ float theta_scale,
+ float freq_scale,
+ float attn_factor,
+ bool is_neox) {
+ ggml_tensor * src0 = dst->src[0]; // input
+ ggml_tensor * src1 = dst->src[1]; // position
+ ggml_tensor * src2 = dst->src[2]; // freq_factors
+
+ if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.ext_factor == ext_factor &&
+ ctx.rope_cache.theta_scale == theta_scale && ctx.rope_cache.freq_scale == freq_scale &&
+ ctx.rope_cache.attn_factor == attn_factor && ctx.rope_cache.is_neox == is_neox) {
// use cache.
return;
}
int64_t theta_scale_length = src0->ne[0] / 2;
- int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
- size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
- theta_scale_length * sizeof(float)};
+ int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
+ size_t theta_scale_nb[] = { sizeof(float), sizeof(float), sizeof(float), theta_scale_length * sizeof(float) };
GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0];
- int64_t position_ne[] = {1, 1, position_length, 1};
- size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t),
- sizeof(int32_t) * position_length};
+ int64_t position_ne[] = { 1, 1, position_length, 1 };
+ size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
- int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
- size_t theta_nb[GGML_MAX_DIMS];
+ int64_t theta_ne[] = { theta_scale_length, 1, position_length, 1 };
+ size_t theta_nb[GGML_MAX_DIMS];
theta_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
}
// theta_scale arange, [0,1,...,ne00/2 - 1]
- aclTensor* acl_theta_scale_tensor = nullptr;
+ aclTensor * acl_theta_scale_tensor = nullptr;
// cache theta scale
if (ctx.rope_cache.theta_scale_length != theta_scale_length ||
// theta_scale and freq_scale should not change during the current token inference process,
// so we can directly use == here instead of comparing the absolute difference.
- ctx.rope_cache.theta_scale != theta_scale ||
- ctx.rope_cache.freq_scale != freq_scale) {
-
+ ctx.rope_cache.theta_scale != theta_scale || ctx.rope_cache.freq_scale != freq_scale) {
ctx.rope_cache.theta_scale_length = theta_scale_length;
if (ctx.rope_cache.theta_scale_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
}
- ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
+ ACL_MEM_MALLOC_HUGE_FIRST));
- acl_theta_scale_tensor =
- ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
- float start = 0;
- float step = 1;
- float stop = theta_scale_length;
+ float start = 0;
+ float step = 1;
+ float stop = theta_scale_length;
float n_elements = theta_scale_length;
aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
- aclTensor* acl_yarn_ramp_tensor = nullptr;
+ aclTensor * acl_yarn_ramp_tensor = nullptr;
if (ext_factor != 0) {
// -rope_yarn_ramp
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
// return MIN(1, MAX(0, y)) - 1;
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
- void* yarn_ramp_buffer = yarn_ramp_allocator.get();
- acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float),
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
- float zero_value = 0, one_value = 1;
- float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
- aclScalar* low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
- aclScalar* zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT);
- aclScalar* one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT);
- aclScalar* denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT);
- aclScalar* ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT);
+ void * yarn_ramp_buffer = yarn_ramp_allocator.get();
+ acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne,
+ theta_scale_nb, GGML_MAX_DIMS);
+ float zero_value = 0, one_value = 1;
+ float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
+ aclScalar * low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
+ aclScalar * zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT);
+ aclScalar * one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT);
+ aclScalar * denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT);
+ aclScalar * ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor, low, one, acl_yarn_ramp_tensor);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor, denom_safe);
        //
        // rope_yarn computes theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
        // since the ramp computed above is the negation of rope_yarn_ramp, the equivalent factor to cache is
        // freq_scale + (freq_scale - 1) * ramp, which is what the two ops below compute.
- float freq_scale_1 = freq_scale - 1;
- aclScalar* freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT);
- aclScalar* freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT);
+ float freq_scale_1 = freq_scale - 1;
+ aclScalar * freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT);
+ aclScalar * freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, freq_scale_1_sc);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor, freq_scale_sc, one);
}
// power
- aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
- GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
- acl_theta_scale_tensor);
+ aclScalar * acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
+ GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, acl_theta_scale_tensor);
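+        // acl_theta_scale_tensor[i] = theta_scale^i = freq_base^(-2*i/n_dims), the per-dimension RoPE frequencies.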
if (ext_factor != 0) {
aclnn_mul(ctx, acl_theta_scale_tensor, acl_yarn_ramp_tensor);
ggml_cann_release_resources(ctx, acl_yarn_ramp_tensor, acl_theta_scale);
} else {
// use cache
- acl_theta_scale_tensor =
- ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
}
ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
// freq_factors
if (src2) {
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
- void* freq_fac_res_ptr = freq_fac_res_allocator.get();
- aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
- src2->data, ggml_cann_type_mapping(src2->type),
- ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
- aclTensor* acl_freq_fac_res_tensor = ggml_cann_create_tensor(
- freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
- theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ void * freq_fac_res_ptr = freq_fac_res_allocator.get();
+ aclTensor * acl_freq_factors_tensor =
+ ggml_cann_create_tensor(src2->data, ggml_cann_type_mapping(src2->type), ggml_type_size(src2->type),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
+ aclTensor * acl_freq_fac_res_tensor = ggml_cann_create_tensor(freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
+ theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
}
int64_t repeat_theta_length = theta_scale_length * position_length * 2;
- ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
- ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ ACL_CHECK(
+ aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
+ ACL_CHECK(
+ aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
}
// position
- aclTensor* acl_position_tensor = ggml_cann_create_tensor(
- src1->data, ggml_cann_type_mapping(src1->type),
- ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS);
+ aclTensor * acl_position_tensor =
+ ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne,
+ position_nb, GGML_MAX_DIMS);
// power * position
- int64_t theta_length = theta_scale_length * position_length;
- ggml_cann_pool_alloc theta_allocator(ctx.pool(),
- theta_length * sizeof(float));
- void* theta_buffer = theta_allocator.get();
+ int64_t theta_length = theta_scale_length * position_length;
+ ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
+ void * theta_buffer = theta_allocator.get();
- aclTensor* acl_theta_tensor =
- ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float),
- theta_ne, theta_nb, GGML_MAX_DIMS);
- aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
- acl_theta_tensor);
+ aclTensor * acl_theta_tensor =
+ ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS);
+ aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor);
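+    // theta[p, i] = position[p] * theta_scale^i; their sin/cos are computed next and cached for reuse.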
// sin/cos
- ggml_cann_pool_alloc sin_allocator(ctx.pool(),
- theta_length * sizeof(float));
- void* sin_buffer = sin_allocator.get();
- aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
- sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
- GGML_MAX_DIMS, ACL_FORMAT_ND);
+ ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
+ void * sin_buffer = sin_allocator.get();
+ aclTensor * acl_sin_tensor =
+ ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
- ggml_cann_pool_alloc cos_allocator(ctx.pool(),
- theta_length * sizeof(float));
- void* cos_buffer = cos_allocator.get();
- aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
- cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
- GGML_MAX_DIMS, ACL_FORMAT_ND);
+ ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
+ void * cos_buffer = cos_allocator.get();
+ aclTensor * acl_cos_tensor =
+ ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
if (ext_factor != 0) {
aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
}
- int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
- size_t sin_reshape_nb[GGML_MAX_DIMS];
+ int64_t sin_reshape_ne[4] = { src0->ne[0], 1, src0->ne[2], 1 };
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
}
- aclTensor* acl_sin_repeat_tensor =
- ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
- aclTensor* acl_cos_repeat_tensor =
- ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+ aclTensor * acl_sin_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+ aclTensor * acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
// repeat
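+    // is_neox: tile the half-length sin/cos block once more along the feature dim ([s0..s_{d/2-1}, s0..s_{d/2-1}]);
+    // otherwise repeat each element in place ([s0, s0, s1, s1, ...]) to match the interleaved pair layout.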
if (is_neox) {
- int64_t repeatsArray[] = {1, 1, 1, 2};
+ int64_t repeatsArray[] = { 1, 1, 1, 2 };
aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
} else {
int64_t num_repeats = 2;
- int64_t dim = 3;
+ int64_t dim = 3;
int64_t output_size = theta_scale_length * num_repeats;
- aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
- num_repeats, output_size);
- aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
- num_repeats, output_size);
+ aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, num_repeats, output_size);
+ aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, num_repeats, output_size);
}
    // Layers other than the first reuse this cache.
- ctx.rope_cache.cached = true;
- ctx.rope_cache.ext_factor = ext_factor;
+ ctx.rope_cache.cached = true;
+ ctx.rope_cache.ext_factor = ext_factor;
ctx.rope_cache.theta_scale = theta_scale;
- ctx.rope_cache.freq_scale = freq_scale;
+ ctx.rope_cache.freq_scale = freq_scale;
ctx.rope_cache.attn_factor = attn_factor;
- ctx.rope_cache.is_neox = is_neox;
+ ctx.rope_cache.is_neox = is_neox;
- ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
- acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
- acl_cos_repeat_tensor);
+ ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, acl_theta_tensor, acl_sin_tensor,
+ acl_sin_repeat_tensor, acl_cos_tensor, acl_cos_repeat_tensor);
}
#ifdef __cplusplus
extern "C" {
#endif
-aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
- const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
- int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
- aclOpExecutor** executor);
-aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
- uint64_t workspaceSize,
- aclOpExecutor* executor,
- aclrtStream stream);
+aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(const aclTensor * x,
+ const aclTensor * cos,
+ const aclTensor * sin,
+ int64_t mode,
+ const aclTensor * yOut,
+ uint64_t * workspaceSize,
+ aclOpExecutor ** executor);
+aclnnStatus aclnnRotaryPositionEmbedding(void * workspace,
+ uint64_t workspaceSize,
+ aclOpExecutor * executor,
+ aclrtStream stream);
#ifdef __cplusplus
}
#endif
-void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
- ggml_tensor* src0 = dst->src[0]; // input
+void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // input
// param
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
// const int n_past = ((int32_t *) dst->op_params)[0];
- const int n_dims = ((int32_t*)dst->op_params)[1];
- const int mode = ((int32_t*)dst->op_params)[2];
+ const int n_dims = ((int32_t *) dst->op_params)[1];
+ const int mode = ((int32_t *) dst->op_params)[2];
// const int n_ctx = ((int32_t *) dst->op_params)[3];
- const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
GGML_TENSOR_UNARY_OP_LOCALS
- memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
- memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
- memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
- memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
- memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
- memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
+ memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
// TODO: n_dims <= ne0
GGML_ASSERT(n_dims == ne0);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
float corr_dims[2];
- ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
- beta_slow, corr_dims);
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
// init ctx.rope_cos/rope_sin cache
- aclnn_cache_init(ctx, dst, corr_dims, ext_factor,
- theta_scale, freq_scale, attn_factor, is_neox);
+ aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox);
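+ // view the cached sin/cos as [ne00, 1, ne02, 1] so they broadcast across the remaining dims of src0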
- int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
- size_t sin_reshape_nb[GGML_MAX_DIMS];
+ int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
+ size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
}
- aclTensor* acl_sin_reshape_tensor =
- ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
- aclTensor* acl_cos_reshape_tensor =
- ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
- sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+ aclTensor * acl_sin_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
+ aclTensor * acl_cos_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
+ sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
#ifdef ASCEND_310P
// Special ROPE operation for 310P
// roll input
- void* input_roll_buffer;
- aclTensor* acl_minus_one_tensor;
- void* minus_one_scale_buffer = nullptr;
+ void * input_roll_buffer;
+ aclTensor * acl_minus_one_tensor;
+ void * minus_one_scale_buffer = nullptr;
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
- ggml_cann_pool_alloc minus_one_scale_allocator(
- ctx.pool(), sizeof(float) * src0->ne[0]);
+ ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), sizeof(float) * src0->ne[0]);
if (!is_neox) {
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
- input_roll_buffer = roll_allocator.get();
- int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
- src0->ne[2], src0->ne[3]};
- size_t input_roll_nb[GGML_MAX_DIMS];
+ input_roll_buffer = roll_allocator.get();
+ int64_t input_roll_ne[4] = { 2, src0->ne[1] * (src0->ne[0] / 2), src0->ne[2], src0->ne[3] };
+ size_t input_roll_nb[GGML_MAX_DIMS];
input_roll_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
}
- aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
- GGML_MAX_DIMS);
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(
- src0->data, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), input_roll_ne, input_roll_nb,
- GGML_MAX_DIMS);
-
- int64_t shifts[] = {1};
- int64_t dims[] = {3};
+ aclTensor * acl_input_roll_tensor =
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
+ aclTensor * acl_input_tensor =
+ ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
+
+ int64_t shifts[] = { 1 };
+ int64_t dims[] = { 3 };
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
// init [-1, 1, -1, 1, ...]
minus_one_scale_buffer = minus_one_scale_allocator.get();
- int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
- size_t minus_one_nb[GGML_MAX_DIMS];
+ int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
+ size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
- acl_minus_one_tensor = aclnn_values(
- ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
- minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
- int64_t dim = 3;
- int64_t* index = new int64_t[src0->ne[0]];
+ acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
+ int64_t dim = 3;
+ int64_t * index = new int64_t[src0->ne[0]];
for (int i = 0; i < src0->ne[0]; i++) {
index[i] = i / 2 * 2;
}
int64_t index_num = src0->ne[0];
- float value = -1;
- aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
- index_num, value);
+ float value = -1;
+ aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, index_num, value);
} else {
// roll input: [q0,q1,q2,...] ->
// [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
input_roll_buffer = roll_allocator.get();
- aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor(
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS);
- aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0);
+ aclTensor * acl_input_roll_tensor =
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ src0->ne, src0->nb, GGML_MAX_DIMS);
+ aclTensor * acl_input_tensor = ggml_cann_create_tensor(src0);
- int64_t shifts[] = {src0->ne[0] / 2};
- int64_t dims[] = {3};
+ int64_t shifts[] = { src0->ne[0] / 2 };
+ int64_t dims[] = { 3 };
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
// init scale [-1, ..., -1, 1, ..., 1] (first half -1, second half 1)
- minus_one_scale_buffer = minus_one_scale_allocator.get();
- int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
- size_t minus_one_nb[GGML_MAX_DIMS];
+ minus_one_scale_buffer = minus_one_scale_allocator.get();
+ int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
+ size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
- acl_minus_one_tensor = aclnn_values(
- ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
- minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
+ acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
+ GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
// -1 * first half
- int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
- size_t first_half_nb[GGML_MAX_DIMS];
+ int64_t first_half_ne[4] = { src0->ne[0] / 2, 1, 1, 1 };
+ size_t first_half_nb[GGML_MAX_DIMS];
first_half_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
}
- aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
- minus_one_scale_buffer, ACL_FLOAT, sizeof(float), first_half_ne,
- first_half_nb, GGML_MAX_DIMS);
- bool inplace = true;
- float scale = -1;
+ aclTensor * acl_first_half_tensor = ggml_cann_create_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float),
+ first_half_ne, first_half_nb, GGML_MAX_DIMS);
+ bool inplace = true;
+ float scale = -1;
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
ggml_cann_release_resources(ctx, acl_first_half_tensor);
}
GGML_ASSERT(n_dims == src0->ne[0]);
// input * scale
- ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
- ggml_nbytes(src0));
- void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
- size_t input_nb[GGML_MAX_DIMS];
+ ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0));
+ void * input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
+ size_t input_nb[GGML_MAX_DIMS];
input_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
}
- aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor(
- input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
- aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor(
- input_roll_buffer, ggml_cann_type_mapping(src0->type),
- ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+ aclTensor * acl_input_roll_mul_scale_tensor =
+ ggml_cann_create_tensor(input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
+ ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
+ aclTensor * acl_input_roll_reshape_tensor =
+ ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
+ src0->ne, input_nb, GGML_MAX_DIMS);
- aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
- acl_input_roll_mul_scale_tensor);
+ aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor);
// output
- void* output_fp32_buffer;
+ void * output_fp32_buffer;
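+ // combine: dst = src * cos + (rolled src * sign scale) * sin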
if (src0->type == GGML_TYPE_F32) {
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor);
- aclnn_mul(ctx, acl_input_roll_mul_scale_tensor,
- acl_sin_reshape_tensor);
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor);
aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
// TODO: ne0 != n_dims in mode2
} else if (src0->type == GGML_TYPE_F16) {
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
}
- ggml_cann_pool_alloc fp32_allocator1(
- ctx.pool(), ggml_nelements(dst) * sizeof(float));
- void* input_fp32_buffer1 = fp32_allocator1.get();
- aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
- input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne,
- input_fp32_nb, GGML_MAX_DIMS);
- ggml_cann_pool_alloc fp32_allocator2(
- ctx.pool(), ggml_nelements(dst) * sizeof(float));
- void* input_fp32_buffer2 = fp32_allocator2.get();
- aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
- input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne,
- input_fp32_nb, GGML_MAX_DIMS);
-
- ggml_cann_pool_alloc fp32_allocator(
- ctx.pool(), ggml_nelements(dst) * sizeof(float));
- output_fp32_buffer = fp32_allocator.get();
- aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
- output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne,
- input_fp32_nb, GGML_MAX_DIMS);
+ ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ void * input_fp32_buffer1 = fp32_allocator1.get();
+ aclTensor * input_fp32_tensor1 = ggml_cann_create_tensor(input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne,
+ input_fp32_nb, GGML_MAX_DIMS);
+ ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ void * input_fp32_buffer2 = fp32_allocator2.get();
+ aclTensor * input_fp32_tensor2 = ggml_cann_create_tensor(input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne,
+ input_fp32_nb, GGML_MAX_DIMS);
+
+ ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ output_fp32_buffer = fp32_allocator.get();
+ aclTensor * output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne,
+ input_fp32_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
- aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
- input_fp32_tensor2);
- aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
- output_fp32_tensor);
+ aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, input_fp32_tensor2);
+ aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, output_fp32_tensor);
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
- ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2,
- output_fp32_tensor, acl_sin_reshape_tensor,
- acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
- acl_input_roll_reshape_tensor, acl_src);
+ ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2, output_fp32_tensor,
+ acl_sin_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
+ acl_input_roll_reshape_tensor, acl_src);
}
return;
#endif
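+ // map ggml's rope mode to the ACL op's mode argument (ggml mode 0 is passed as 1)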
int64_t acl_mode = mode == 0 ? 1 : mode;
switch (src0->type) {
- case GGML_TYPE_F32: {
- GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src,
- acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst);
- break;
- }
- case GGML_TYPE_F16: {
- ggml_cann_pool_alloc src_trans_allocator(
- ctx.pool(), ggml_nelements(src0) * sizeof(float));
- void* src_trans_buffer = src_trans_allocator.get();
- ggml_cann_pool_alloc dst_trans_allocator(
- ctx.pool(), ggml_nelements(dst) * sizeof(float));
- void* dst_trans_buffer = dst_trans_allocator.get();
-
- size_t src_trans_nb[GGML_MAX_DIMS];
- src_trans_nb[0] = sizeof(float);
- for (int i = 1; i < GGML_MAX_DIMS; i++) {
- src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ case GGML_TYPE_F32:
+ {
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src, acl_cos_reshape_tensor,
+ acl_sin_reshape_tensor, acl_mode, acl_dst);
+ break;
}
+ case GGML_TYPE_F16:
+ {
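+ // the ACL rope kernel is invoked in fp32 here: cast the f16 input up, apply rope, cast the result back to f16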
+ ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
+ void * src_trans_buffer = src_trans_allocator.get();
+ ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
+ void * dst_trans_buffer = dst_trans_allocator.get();
- aclTensor* acl_src_trans_tensor = ggml_cann_create_tensor(
- src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb,
- GGML_MAX_DIMS);
- aclTensor* acl_dst_trans_tensor = ggml_cann_create_tensor(
- dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb,
- GGML_MAX_DIMS);
+ size_t src_trans_nb[GGML_MAX_DIMS];
+ src_trans_nb[0] = sizeof(float);
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+ }
- aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
+ aclTensor * acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float),
+ src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ aclTensor * acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float),
+ dst->ne, src_trans_nb, GGML_MAX_DIMS);
- GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor,
- acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
- acl_dst_trans_tensor);
+ aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
- aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
+ GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor, acl_cos_reshape_tensor,
+ acl_sin_reshape_tensor, acl_mode, acl_dst_trans_tensor);
- ggml_cann_release_resources(ctx, acl_src_trans_tensor,
- acl_dst_trans_tensor);
- break;
- }
+ aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16);
+
+ ggml_cann_release_resources(ctx, acl_src_trans_tensor, acl_dst_trans_tensor);
+ break;
+ }
default:
GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
break;
}
- ggml_cann_release_resources(ctx, acl_cos_reshape_tensor,
- acl_sin_reshape_tensor, acl_src, acl_dst);
+ ggml_cann_release_resources(ctx, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_src, acl_dst);
}
-
- void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
+ aclTensor * acl_src = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst);
ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
-void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
// stride
- int64_t s0 = ((const int32_t*)(dst->op_params))[0];
+ int64_t s0 = ((const int32_t *) (dst->op_params))[0];
- aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
- aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
+ aclTensor * acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
+ aclTensor * acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
int64_t strideVal[1];
- strideVal[0] = s0;
- aclIntArray *stride = aclCreateIntArray(strideVal, 1);
- int64_t paddingVal[] = {0};
- aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
- int64_t dilationVal[] = {1};
- aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
- int8_t cubeMathType = 0;
+ strideVal[0] = s0;
+ aclIntArray * stride = aclCreateIntArray(strideVal, 1);
+ int64_t paddingVal[] = { 0 };
+ aclIntArray * padding = aclCreateIntArray(paddingVal, 1);
+ int64_t dilationVal[] = { 1 };
+ aclIntArray * dilation = aclCreateIntArray(dilationVal, 1);
+ int8_t cubeMathType = 0;
#ifdef ASCEND_310P
cubeMathType = 1;
#endif
- GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
- padding, dilation, true, padding, 1, acl_dst, cubeMathType);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride, padding, dilation, true, padding,
+ 1, acl_dst, cubeMathType);
ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
}
-void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
- aclTensor* acl_input = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_input = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
- float alphaValue = 1.0f;
- aclScalar* alpha = nullptr;
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ float alphaValue = 1.0f;
+ aclScalar * alpha = nullptr;
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
- GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha,
- acl_dst);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha, acl_dst);
ggml_cann_release_resources(ctx, acl_input, acl_dst, alpha);
}
-void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
- int64_t reduceDimValue[] = {3};
- aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1);
- bool keepDim = true;
+ int64_t reduceDimValue[] = { 3 };
+ aclIntArray * reduceDim = aclCreateIntArray(reduceDimValue, 1);
+ bool keepDim = true;
GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim);
}
-void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
- ggml_tensor * src0 = dst->src[0];
- int32_t *opts = (int32_t *) dst->op_params;
- int64_t paddingsArray[2] = {opts[0], opts[1]};
- aclIntArray* paddings = aclCreateIntArray(paddingsArray, 2);
+void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0];
+ int32_t * opts = (int32_t *) dst->op_params;
+ int64_t paddingsArray[2] = { opts[0], opts[1] };
+ aclIntArray * paddings = aclCreateIntArray(paddingsArray, 2);
for (int64_t i = 0; i < src0->ne[3]; i++) {
- aclTensor* acl_src = ggml_cann_create_tensor(
- (char*)src0->data + i * src0->ne[3],
- ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
- src0->ne, src0->nb, 3);
+ aclTensor * acl_src =
+ ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type),
+ ggml_element_size(src0), src0->ne, src0->nb, 3);
- aclTensor* acl_dst = ggml_cann_create_tensor(
- (char*)dst->data + i * src0->ne[3],
- ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
- dst->ne, dst->nb, 3);
+ aclTensor * acl_dst =
+ ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type),
+ ggml_element_size(dst), dst->ne, dst->nb, 3);
- GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst);
+ GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst);
- ggml_cann_release_resources(ctx, acl_src, acl_dst);
+ ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
ggml_cann_release_resources(ctx, paddings);
}
-void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
ggml_tensor * src1 = dst->src[1];
- aclTensor* acl_self = ggml_cann_create_tensor(src0);
- aclTensor* acl_other = ggml_cann_create_tensor(src1);
+ aclTensor * acl_self = ggml_cann_create_tensor(src0);
+ aclTensor * acl_other = ggml_cann_create_tensor(src1);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other);
ggml_cann_release_resources(ctx, acl_self, acl_other);
}
-void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor * src0 = dst->src[0];
- aclTensor* acl_src = ggml_cann_create_tensor(src0);
- aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+ aclTensor * acl_src = ggml_cann_create_tensor(src0);
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst);
- float alphaValue = 0.0f;
- aclScalar* alpha = nullptr;
- alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+ float alphaValue = 0.0f;
+ aclScalar * alpha = nullptr;
+ alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
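+ // step(x): compare against 0, producing 1 where x > 0 and 0 otherwise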
GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst);
* @note This function assumes floating-point data types and is designed for
* MoE architectures, possibly involving sparse expert routing.
*/
-static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
//dst [M, K, N, 1]
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] -> [D, M, K, 1]
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
GGML_ASSERT(batch == ids->ne[1]);
ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
- void* export_ptr = export_allocator.get();
+ void * export_ptr = export_allocator.get();
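+ // per batch row: gather the selected experts' weights with IndexSelect, then batch-matmul them against the activations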
for (int64_t i = 0; i < batch; i++) {
- aclTensor *select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
- aclTensor *export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
+ aclTensor * select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
+ aclTensor * export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
- int64_t select_export_ne[] = {src0->ne[0], src0->ne[1], ids->ne[0]};
- size_t select_export_nb[3];
+ int64_t select_export_ne[] = { src0->ne[0], src0->ne[1], ids->ne[0] };
+ size_t select_export_nb[3];
select_export_nb[0] = src0->nb[0];
- for (int k = 1;k < 3; k++) {
- select_export_nb[k] = select_export_nb[k-1] * select_export_ne[k-1];
+ for (int k = 1; k < 3; k++) {
+ select_export_nb[k] = select_export_nb[k - 1] * select_export_ne[k - 1];
}
- aclTensor *select_export = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_export_ne, select_export_nb, 3);
+ aclTensor * select_export =
+ ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
+ select_export_ne, select_export_nb, 3);
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight, 0, select_index, select_export);
- int64_t select_transpose_ne[] = {select_export_ne[1], select_export_ne[0], select_export_ne[2]};
- size_t select_transpose_nb[] = {select_export_nb[1], select_export_nb[0], select_export_nb[2]};
- aclTensor *select_export_transpose = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_transpose_ne, select_transpose_nb, 3);
+ int64_t select_transpose_ne[] = { select_export_ne[1], select_export_ne[0], select_export_ne[2] };
+ size_t select_transpose_nb[] = { select_export_nb[1], select_export_nb[0], select_export_nb[2] };
+ aclTensor * select_export_transpose =
+ ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
+ select_transpose_ne, select_transpose_nb, 3);
- int64_t active_tensor_ne[] = {src1->ne[0], 1, src1->ne[1]};
- size_t active_tensor_nb[] = {src1->nb[0], src1->nb[1], src1->nb[1]};
- aclTensor *active_tensor = ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
+ int64_t active_tensor_ne[] = { src1->ne[0], 1, src1->ne[1] };
+ size_t active_tensor_nb[] = { src1->nb[0], src1->nb[1], src1->nb[1] };
+ aclTensor * active_tensor =
+ ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
- int64_t dst_ne[] = {dst->ne[0], 1, dst->ne[1]};
- size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[1]};
- aclTensor *acl_dst = ggml_cann_create_tensor(dst, dst_ne,dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
+ int64_t dst_ne[] = { dst->ne[0], 1, dst->ne[1] };
+ size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[1] };
+ aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor, select_export_transpose, acl_dst, 2);
- ggml_cann_release_resources(ctx, select_index, export_weight, select_export, active_tensor, acl_dst, select_export_transpose);
+ ggml_cann_release_resources(ctx, select_index, export_weight, select_export, active_tensor, acl_dst,
+ select_export_transpose);
}
}
* @note This function assumes quantized data types and is designed for
* MoE architectures with potential sparse expert routing.
*/
-static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
// TODO: Use aclnnGroupedMatMul
//dst [M, K, N, 1]
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
GGML_TENSOR_BINARY_OP_LOCALS
// copy index from npu to cpu
- int64_t n_as = ne02; // A
- int64_t n_ids = ids->ne[0]; // K
+ int64_t n_as = ne02; // A
+ int64_t n_ids = ids->ne[0]; // K
std::vector<char> ids_host(ggml_nbytes(ids));
- ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
- ACL_MEMCPY_DEVICE_TO_HOST);
+ ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids), ACL_MEMCPY_DEVICE_TO_HOST);
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
char * src0_original = (char *) src0->data;
char * src1_original = (char *) src1->data;
- char * dst_original = (char *) dst->data;
+ char * dst_original = (char *) dst->data;
ggml_tensor src0_row = *src0;
ggml_tensor src1_row = *src1;
- ggml_tensor dst_row = *dst;
+ ggml_tensor dst_row = *dst;
const enum ggml_type type = dst->src[0]->type;
- float weight_elem_size;
+ float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
}
// src0_row [D, M, 1, 1] weight without permute
- src0_row.ne[2] = 1;
- src0_row.ne[3] = 1;
- src0_row.nb[0] = weight_elem_size;
- src0_row.nb[1] = weight_elem_size * ne00;
- src0_row.nb[2] = weight_elem_size * ne00;
- src0_row.nb[3] = weight_elem_size * ne00;
+ src0_row.ne[2] = 1;
+ src0_row.ne[3] = 1;
+ src0_row.nb[0] = weight_elem_size;
+ src0_row.nb[1] = weight_elem_size * ne00;
+ src0_row.nb[2] = weight_elem_size * ne00;
+ src0_row.nb[3] = weight_elem_size * ne00;
size_t weight_stride = ne00 * ne01 * weight_elem_size;
- size_t weight_size = weight_stride * ne02 * ne03;
+ size_t weight_size = weight_stride * ne02 * ne03;
// scale [D, M, 1, 1] -> scale && permute
size_t scale_elem_size = sizeof(uint16_t);
- size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
+ size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
// src1_row [D, 1, 1, 1] -> input
src1_row.ne[1] = 1;
// create weight for one row
ggml_cann_pool_alloc weight_allocator(ctx.pool());
- void* weight_buffer = weight_allocator.alloc(nb02);
+ void * weight_buffer = weight_allocator.alloc(nb02);
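+ // per (token, expert) pair: stage that expert's weights and scales in the scratch buffer and point src0_row/src1_row/dst_row at the right slices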
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
for (int64_t id = 0; id < n_ids; id++) {
// expert index
- int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+ int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]);
GGML_ASSERT(i02 >= 0 && i02 < n_as);
// If B = 1 (broadcast), always use 0; otherwise, use id.
int64_t i1 = id;
int64_t i2 = i12;
- void* src0_tmp_ptr = src0_original + i02*weight_stride;
- void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
- void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
- void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
+ void * src0_tmp_ptr = src0_original + i02 * weight_stride;
+ void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride;
+ void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12;
+ void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2;
// mem cpy
- ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride,
- ACL_MEMCPY_DEVICE_TO_DEVICE);
- void* scale_buffer = (char*)weight_buffer + weight_stride;
- ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride,
- ACL_MEMCPY_DEVICE_TO_DEVICE);
-
- src0_row.data = weight_buffer;
- src1_row.data = src1_tmp_ptr;
- dst_row.data = dst_tmp_ptr;
+ ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride, ACL_MEMCPY_DEVICE_TO_DEVICE);
+ void * scale_buffer = (char *) weight_buffer + weight_stride;
+ ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride, ACL_MEMCPY_DEVICE_TO_DEVICE);
+
+ src0_row.data = weight_buffer;
+ src1_row.data = src1_tmp_ptr;
+ dst_row.data = dst_tmp_ptr;
dst_row.src[0] = &src0_row;
dst_row.src[1] = &src1_row;
return;
}
-void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
const enum ggml_type type = dst->src[0]->type;
switch (type) {
case GGML_TYPE_F32:
}
}
-void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
-
- ggml_tensor* src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
- ggml_tensor* src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
- ggml_tensor* src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
- ggml_tensor* src3 = dst->src[3]; // mask, fp16
+void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+ ggml_tensor * src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
+ ggml_tensor * src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
+ ggml_tensor * src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
+ ggml_tensor * src3 = dst->src[3]; // mask, fp16
// B, N, S, D (uncont) -> B, S, N, D (cont)
int64_t src0_bsnd_ne[GGML_MAX_DIMS];
size_t src2_bsnd_nb[GGML_MAX_DIMS];
memcpy(src2_bsnd_nb, src2->nb, GGML_MAX_DIMS * sizeof(size_t));
- auto transpose12 = [](int64_t* ne, size_t* nb) {
+ auto transpose12 = [](int64_t * ne, size_t * nb) {
int64_t ne_tmp = ne[1];
size_t nb_tmp = nb[1];
- ne[1] = ne[2];
- nb[1] = nb[2];
- ne[2] = ne_tmp;
- nb[2] = nb_tmp;
+ ne[1] = ne[2];
+ nb[1] = nb[2];
+ ne[2] = ne_tmp;
+ nb[2] = nb_tmp;
};
transpose12(src0_bsnd_ne, src0_bsnd_nb);
transpose12(src1_bsnd_ne, src1_bsnd_nb);
transpose12(src2_bsnd_ne, src2_bsnd_nb);
- float maxBias = 0.0f;
- float scaleValue = 1.0f;
+ float maxBias = 0.0f;
+ float scaleValue = 1.0f;
float logitSoftcap = 0.0f;
- memcpy(&scaleValue, (float*)dst->op_params + 0, sizeof(float));
- memcpy(&maxBias, (float*)dst->op_params + 1, sizeof(float));
- memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float));
+ memcpy(&scaleValue, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&maxBias, (float *) dst->op_params + 1, sizeof(float));
+ memcpy(&logitSoftcap, (float *) dst->op_params + 2, sizeof(float));
- if(logitSoftcap == 0.0f){
+ if (logitSoftcap == 0.0f) {
size_t faElemSize = sizeof(uint16_t);
- auto faDataType = ACL_FLOAT16; //ACL_BF16;
+ auto faDataType = ACL_FLOAT16; //ACL_BF16;
- aclTensor* acl_src0_f16_tensor = nullptr;
- aclTensor* acl_src1_f16_tensor = nullptr;
- aclTensor* acl_src2_f16_tensor = nullptr;
+ aclTensor * acl_src0_f16_tensor = nullptr;
+ aclTensor * acl_src1_f16_tensor = nullptr;
+ aclTensor * acl_src2_f16_tensor = nullptr;
// Step 1: cast the src0 (Query) to fp16 if needed
ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
- void* src0_f16_buffer = nullptr;
+ void * src0_f16_buffer = nullptr;
- if(ggml_cann_type_mapping(src0->type) != faDataType){
- aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne,
- src0_bsnd_nb, GGML_MAX_DIMS);
- src0_f16_buffer = src0_f16_allocator.alloc(
- ggml_nelements(src0) * faElemSize);
+ if (ggml_cann_type_mapping(src0->type) != faDataType) {
+ aclTensor * acl_src0_f32_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
+ src0_f16_buffer = src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize);
- int64_t* src0_f16_ne = src0_bsnd_ne;
- size_t src0_f16_nb[GGML_MAX_DIMS];
+ int64_t * src0_f16_ne = src0_bsnd_ne;
+ size_t src0_f16_nb[GGML_MAX_DIMS];
src0_f16_nb[0] = sizeof(uint16_t);
- for(int i = 1; i < GGML_MAX_DIMS; ++i){
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
}
- acl_src0_f16_tensor = ggml_cann_create_tensor(
- src0_f16_buffer, faDataType, faElemSize,
- src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS
- );
+ acl_src0_f16_tensor = ggml_cann_create_tensor(src0_f16_buffer, faDataType, faElemSize, src0_f16_ne,
+ src0_f16_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType);
ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
- }else{
- acl_src0_f16_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne,
- src0_bsnd_nb, GGML_MAX_DIMS);
+ } else {
+ acl_src0_f16_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
}
// Step 2: create the acl tensors for src1 (Key), src2 (Value),
// and the direct output from FusedInferAttention
- acl_src1_f16_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne,
- src1_bsnd_nb, GGML_MAX_DIMS);
- acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
- src2_bsnd_nb, GGML_MAX_DIMS);
+ acl_src1_f16_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
+ acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
// Step 3: create the PSEShift tensor if needed
// this tensor corresponds to the attention mask (f16) in llama.cpp
- aclTensor* bcast_pse_tensor = nullptr;
+ aclTensor * bcast_pse_tensor = nullptr;
ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
- if(src3 != nullptr){
+ if (src3 != nullptr) {
// Construct the truncated pse tensor (common for prefill/decode)
int64_t trunc_pse_ne[GGML_MAX_DIMS] = {
- src3->ne[0], // D
- src0->ne[1], // S (number of Q tokens)
- src3->ne[2], // mask N
- src3->ne[3] // B
+ src3->ne[0], // D
+ src0->ne[1], // S (number of Q tokens)
+ src3->ne[2], // mask N
+ src3->ne[3] // B
};
- size_t* trunc_pse_nb = src3->nb;
+ size_t * trunc_pse_nb = src3->nb;
- aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
- src3->data, ACL_FLOAT16, sizeof(uint16_t),
- trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS
- );
+ aclTensor * acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t),
+ trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
int64_t bcast_pse_ne[GGML_MAX_DIMS];
- size_t bcast_pse_nb[GGML_MAX_DIMS];
- bcast_pse_ne[0] = src3->ne[0]; // D
- bcast_pse_ne[1] = src0->ne[1]; // S
- bcast_pse_ne[2] = src0->ne[2]; // N (num_heads)
- bcast_pse_ne[3] = src3->ne[3]; // B
+ size_t bcast_pse_nb[GGML_MAX_DIMS];
+ bcast_pse_ne[0] = src3->ne[0]; // D
+ bcast_pse_ne[1] = src0->ne[1]; // S
+ bcast_pse_ne[2] = src0->ne[2]; // N (num_heads)
+ bcast_pse_ne[3] = src3->ne[3]; // B
if (maxBias == 0.0f) {
// When maxBias == 0.0f, a zero stride on the head dim lets us skip one explicit repeat (Qwen2)
// Construct the bcast tensor (simulate repeat on the head dimension using stride=0)
bcast_pse_nb[0] = sizeof(uint16_t);
bcast_pse_nb[1] = bcast_pse_nb[0] * bcast_pse_ne[0];
- bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data
+ bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data
bcast_pse_nb[3] = src3->nb[3];
- bcast_pse_tensor = ggml_cann_create_tensor(
- src3->data, ACL_FLOAT16, sizeof(uint16_t),
- bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS
- );
+ bcast_pse_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), bcast_pse_ne,
+ bcast_pse_nb, GGML_MAX_DIMS);
ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor);
} else {
bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
}
- void* bcast_pse_buffer = bcast_pse_allocator.alloc(
- ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t)
- );
+ void * bcast_pse_buffer =
+ bcast_pse_allocator.alloc(ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
- bcast_pse_tensor = ggml_cann_create_tensor(
- bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
- bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS
- );
+ bcast_pse_tensor = ggml_cann_create_tensor(bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
+ bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
- int64_t repeats[] = {1, src0->ne[2], 1, 1};
+ int64_t repeats[] = { 1, src0->ne[2], 1, 1 };
aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats);
// alibi
// Compute the slope if needed. Derived from ggml_cann_softmax().
- const int64_t n_heads = src0->ne[2];
+ const int64_t n_heads = src0->ne[2];
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
- void* slope_buffer = slope_allocator.get();
+ void * slope_buffer = slope_allocator.get();
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
- int64_t slope_ne[] = {1, 1, n_heads, 1};
- size_t slope_nb[GGML_MAX_DIMS];
+ int64_t slope_ne[] = { 1, 1, n_heads, 1 };
+ size_t slope_nb[GGML_MAX_DIMS];
slope_nb[0] = sizeof(uint16_t);
- for(int i = 1;i<GGML_MAX_DIMS;i++) {
- slope_nb[i] = slope_nb[i-1] * slope_ne[0];
+ for (int i = 1; i < GGML_MAX_DIMS; i++) {
+ slope_nb[i] = slope_nb[i - 1] * slope_ne[0];
}
- aclTensor* slope_tensor = ggml_cann_create_tensor(
- slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
- slope_ne, slope_nb, GGML_MAX_DIMS);
+ aclTensor * slope_tensor = ggml_cann_create_tensor(slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
+ slope_ne, slope_nb, GGML_MAX_DIMS);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
ggml_cann_release_resources(ctx, slope_tensor, acl_mask_f16_trunc_tensor);
}
// Step 4: set the inputs for FusedInferAttention.
- int kvTensorNum = 1;
- aclTensor* acl_q_tensor = acl_src0_f16_tensor;
- aclTensor* acl_k_tensors[] = {acl_src1_f16_tensor};
- aclTensor* acl_v_tensors[] = {acl_src2_f16_tensor};
- aclTensorList* acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
- aclTensorList* acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
-
- int64_t numHeads = src0->ne[2]; // N
- int64_t numKeyValueHeads = src1->ne[2];
+ int kvTensorNum = 1;
+ aclTensor * acl_q_tensor = acl_src0_f16_tensor;
+ aclTensor * acl_k_tensors[] = { acl_src1_f16_tensor };
+ aclTensor * acl_v_tensors[] = { acl_src2_f16_tensor };
+ aclTensorList * acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum);
+ aclTensorList * acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
+
+ int64_t numHeads = src0->ne[2]; // N
+ int64_t numKeyValueHeads = src1->ne[2];
// double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
- int64_t preTokens = 65535;
- int64_t nextTokens = 65535;
- char layout[5] = {'B', 'S', 'N', 'D', 0};
- int64_t sparseMode = 0;
- int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
- int64_t blockSize = 0;
- int64_t antiquantMode = 0;
- bool softmaxLseFlag = false;
- int64_t keyAntiquantMode = 0;
+ int64_t preTokens = 65535;
+ int64_t nextTokens = 65535;
+ char layout[5] = { 'B', 'S', 'N', 'D', 0 };
+ int64_t sparseMode = 0;
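+ // src0->ne[1] is the number of query tokens: single-token (decode) queries use a different innerPrecise mode than prefill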
+ int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
+ int64_t blockSize = 0;
+ int64_t antiquantMode = 0;
+ bool softmaxLseFlag = false;
+ int64_t keyAntiquantMode = 0;
int64_t valueAntiquantMode = 0;
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
- aclTensor * fa_dst_tensor = nullptr;
- aclTensor * acl_dst_tensor = nullptr;
+ aclTensor * fa_dst_tensor = nullptr;
+ aclTensor * acl_dst_tensor = nullptr;
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
if (dst->type == GGML_TYPE_F32) {
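+ // the fused attention kernel produces f16 output; stage it in a scratch buffer and cast to f32 afterwards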
- void* out_f16_buffer = out_f16_allocator.alloc(
- ggml_nelements(dst) * faElemSize);
+ void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
- int64_t* out_f16_ne = src0_bsnd_ne;
- size_t out_f16_nb[GGML_MAX_DIMS];
+ int64_t * out_f16_ne = src0_bsnd_ne;
+ size_t out_f16_nb[GGML_MAX_DIMS];
out_f16_nb[0] = faElemSize;
- for(int i = 1; i < GGML_MAX_DIMS; ++i){
+ for (int i = 1; i < GGML_MAX_DIMS; ++i) {
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
}
- fa_dst_tensor = ggml_cann_create_tensor(
- out_f16_buffer, faDataType, faElemSize,
- out_f16_ne, out_f16_nb, GGML_MAX_DIMS
- );
- }
- else {
+ fa_dst_tensor =
+ ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
+ } else {
fa_dst_tensor = ggml_cann_create_tensor(dst);
}
- GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2,
- acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v
- bcast_pse_tensor, nullptr, // pse, mask
- nullptr, nullptr, // actSeqLen, actSeqLenkv
- nullptr, nullptr, // deqScale1, quantScale1
- nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
- nullptr, nullptr, // antiquantScale, antiquantOffset
- nullptr, // blockTable
- nullptr, nullptr, // qPadSize, kvPadSize
- nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
- nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
- nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
- numHeads, scaleValue, // heads, scaleValue
- preTokens, nextTokens, // preTokens, nextTokens
- layout, // inputLayout
- numKeyValueHeads, // numKVHeads
- sparseMode, innerPrecise, // sparseMode, innerPrecise
- blockSize, antiquantMode, // blockSize, antiquantMode
- softmaxLseFlag, // softmaxLseFlag
- keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
- fa_dst_tensor, // attentionOut
- nullptr // softmaxLse
+ GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, acl_q_tensor, acl_k_tensor_list,
+ acl_v_tensor_list, // q, k, v
+ bcast_pse_tensor, nullptr, // pse, mask
+ nullptr, nullptr, // actSeqLen, actSeqLenkv
+ nullptr, nullptr, // deqScale1, quantScale1
+ nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
+ nullptr, nullptr, // antiquantScale, antiquantOffset
+ nullptr, // blockTable
+ nullptr, nullptr, // qPadSize, kvPadSize
+ nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
+ nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
+ nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
+ numHeads, scaleValue, // heads, scaleValue
+ preTokens, nextTokens, // preTokens, nextTokens
+ layout, // inputLayout
+ numKeyValueHeads, // numKVHeads
+ sparseMode, innerPrecise, // sparseMode, innerPrecise
+ blockSize, antiquantMode, // blockSize, antiquantMode
+ softmaxLseFlag, // softmaxLseFlag
+ keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
+ fa_dst_tensor, // attentionOut
+ nullptr // softmaxLse
);
if (dst->type == GGML_TYPE_F32) {
// Step 6: post-processing, cast the f16 result back to f32
- aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
+ aclTensor * acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
}
- ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
- acl_k_tensor_list,
- acl_v_tensor_list,
- fa_dst_tensor,
- acl_dst_tensor,
- bcast_pse_tensor);
+ ggml_cann_release_resources(ctx, acl_src0_f16_tensor, acl_k_tensor_list, acl_v_tensor_list, fa_dst_tensor,
+ acl_dst_tensor, bcast_pse_tensor);
} else {
GGML_ABORT("Function is not implemented.");
* @param line The line number where the error occurred.
* @param msg The error message.
*/
-[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
- const char* file, int line, const char* msg) {
+[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int32_t id = -1;
aclrtGetDevice(&id);
GGML_LOG_ERROR("CANN error: %s\n", msg);
- GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
- file, line);
+ GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace
GGML_ABORT("CANN error");
aclrtGetDevice(&current_device);
if (device == current_device) {
- return;
+ return;
}
ACL_CHECK(aclrtSetDevice(device));
}
* @brief Get the value of the specified environment variable (name).
* Returns the lowercased value as a std::string, or std::nullopt if the variable is unset.
*/
-std::optional<std::string> get_env(const std::string& name) {
- const char* val = std::getenv(name.c_str());
- if (!val) return std::nullopt;
+std::optional<std::string> get_env(const std::string & name) {
+ const char * val = std::getenv(name.c_str());
+ if (!val) {
+ return std::nullopt;
+ }
std::string res = std::string(val);
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
return res;
/**
* @brief Check whether the given string represents an enabled/true value.
*/
-bool parse_bool(const std::string& value) {
- std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
+bool parse_bool(const std::string & value) {
+ std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
return valid_values.find(value) != valid_values.end();
}
* @param value The string to parse.
* @return The parsed integer, or 0 if conversion fails.
*/
-int parse_integer(const std::string& value) {
+int parse_integer(const std::string & value) {
try {
return std::stoi(value);
} catch (...) {
static ggml_cann_device_info ggml_cann_init() {
ggml_cann_device_info info = {};
- aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
+ aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
if (err != ACL_SUCCESS) {
- GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
- __func__, aclGetRecentErrMsg());
+ GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
return info;
}
for (int id = 0; id < info.device_count; ++id) {
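+ // probe VMM support by querying the allocation granularity for pinned huge-page device memory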
aclrtPhysicalMemProp prop = {};
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
- prop.memAttr = ACL_HBM_MEM_HUGE;
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
- prop.location.id = id;
- prop.reserve = 0;
- err = aclrtMemGetAllocationGranularity(
- &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
- &info.devices[id].vmm_granularity);
- info.devices[id].vmm = err == ACL_SUCCESS;
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+ prop.memAttr = ACL_HBM_MEM_HUGE;
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+ prop.location.id = id;
+ prop.reserve = 0;
+ err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
+ &info.devices[id].vmm_granularity);
+ info.devices[id].vmm = err == ACL_SUCCESS;
size_t free, total;
ggml_backend_cann_get_device_memory(id, &free, &total);
*
* @return A reference to the structure containing the device information.
*/
-const ggml_cann_device_info& ggml_cann_info() {
+const ggml_cann_device_info & ggml_cann_info() {
static ggml_cann_device_info info = ggml_cann_init();
return info;
}
/**
* @brief The minimum free margin for a buffer.
*/
- static const size_t min_free_margin = 1ull << 20; // 1MB
+ static const size_t min_free_margin = 1ull << 20; // 1MB
/**
* @brief The alignment for buffer allocation.
* @brief Structure representing a CANN buffer.
*/
struct ggml_cann_buffer {
- void* ptr = nullptr; ///< Pointer to the buffer.
- size_t size = 0; ///< Size of the buffer.
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
+ void * ptr = nullptr; ///< Pointer to the buffer.
+ size_t size = 0; ///< Size of the buffer.
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
- bool operator>(const ggml_cann_buffer& other) const {
- return size > other.size;
- }
+ bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
};
/**
* @brief Array of CANN buffers in the pool.
*/
- std::unordered_map<void*, size_t> buffer_pool;
- std::priority_queue<ggml_cann_buffer,
- std::vector<ggml_cann_buffer>,
- std::greater<>> free_buffers ;
+ std::unordered_map<void *, size_t> buffer_pool;
+ std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
/**
* @brief Total size of all buffers in the pool.
*/
~ggml_cann_pool_buf_prio() {
ggml_cann_set_device(device);
- for (auto& [b_ptr, b_size] : buffer_pool) {
+ for (auto & [b_ptr, b_size] : buffer_pool) {
aclrtFree(b_ptr);
pool_size -= b_size;
}
* the allocated buffer.
* @return A pointer to the allocated buffer.
*/
- void* alloc(size_t size, size_t* actual_size) override {
+ void * alloc(size_t size, size_t * actual_size) override {
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
- void* ptr = nullptr;
- auto now = std::chrono::steady_clock::now();
+ void * ptr = nullptr;
+ auto now = std::chrono::steady_clock::now();
std::vector<ggml_cann_buffer> free_buffers_rest;
free_buffers_rest.reserve(free_buffers.size());
const size_t margin = b.size - size;
if (margin <= max_reuse_margin) {
*actual_size = b.size;
- ptr = b.ptr;
+ ptr = b.ptr;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: reused %p, "
"pool_size = %5u MB, "
"size = %5u MB, "
"margin = %5u MB\n",
- device, b.ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
#endif
break;
}
}
- bool should_clean = !disable_clean &&
- b.size > min_free_margin &&
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
if (should_clean) {
// free this buffer: it exceeds the minimum free margin and has been idle for over 100 ms
"cann pool[%d]: clean %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
- device, b.ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
#endif
continue;
}
free_buffers_rest.push_back(b);
}
- for (ggml_cann_buffer &b : free_buffers_rest) {
+ for (ggml_cann_buffer & b : free_buffers_rest) {
free_buffers.push(std::move(b));
}
#ifdef DEBUG_CANN_MALLOC
- GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
+ GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
+ (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
#endif
if (ptr != nullptr) {
return ptr;
"cann pool[%d]: allocate %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
- device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576));
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576));
#endif
buffer_pool.emplace(ptr, size);
return ptr;
* @param ptr Pointer to the buffer to free.
* @param size Size of the buffer to free.
*/
- void free(void* ptr, size_t size) override {
+ void free(void * ptr, size_t size) override {
GGML_UNUSED(size);
auto it = buffer_pool.find(ptr);
if (it == buffer_pool.end()) {
}
auto now = std::chrono::steady_clock::now();
- free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
+ free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: return %p, "
"pool_size = %5u MB\n",
- device, ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
+ device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
#endif
}
};
/**
* @brief The minimum free margin for a buffer.
*/
- static const size_t min_free_margin = 1ull << 20; // 1MB
+ static const size_t min_free_margin = 1ull << 20; // 1MB
/**
* @brief The alignment for buffer allocation.
* @brief Structure representing a CANN buffer.
*/
struct ggml_cann_buffer {
- void* ptr = nullptr; ///< Pointer to the buffer memory.
- size_t size = 0; ///< Size of the buffer.
- bool used = false; ///< Whether the buffer is currently in use.
- std::chrono::steady_clock::time_point last_used; ///< Last used time.
+ void * ptr = nullptr; ///< Pointer to the buffer memory.
+ size_t size = 0; ///< Size of the buffer.
+ bool used = false; ///< Whether the buffer is currently in use.
+ std::chrono::steady_clock::time_point last_used; ///< Last used time.
};
/**
~ggml_cann_pool_buf() {
ggml_cann_set_device(device);
for (int i = 0; i < MAX_BUFFERS; ++i) {
- ggml_cann_buffer& b = buffer_pool[i];
+ ggml_cann_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
aclrtFree(b.ptr);
pool_size -= b.size;
* the allocated buffer.
* @return A pointer to the allocated buffer.
*/
- void* alloc(size_t size, size_t* actual_size) override {
+ void * alloc(size_t size, size_t * actual_size) override {
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
- void* ptr = nullptr;
- auto now = std::chrono::steady_clock::now();
+ void * ptr = nullptr;
+ auto now = std::chrono::steady_clock::now();
int i = 0;
for (; i < MAX_BUFFERS; ++i) {
- ggml_cann_buffer& b = buffer_pool[i];
+ ggml_cann_buffer & b = buffer_pool[i];
if (b.ptr == nullptr) {
break;
}
const size_t margin = b.size - size;
if (margin <= max_reuse_margin) {
*actual_size = b.size;
- b.used = true;
- ptr = b.ptr;
+ b.used = true;
+ ptr = b.ptr;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: reused %p, "
"pool_size = %5u MB, "
"size = %5u MB, "
"margin = %5u MB\n",
- device, b.ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
#endif
break;
}
}
- bool should_clean = !disable_clean &&
- b.size > min_free_margin &&
+ bool should_clean = !disable_clean && b.size > min_free_margin &&
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
if (should_clean) {
// free the buffer if the size is needed to be freed
"cann pool[%d]: clean %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
- device, b.ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
#endif
b.ptr = nullptr;
}
if (i < MAX_BUFFERS) {
// allocate a new buffer if no buffer can be reused
- ggml_cann_buffer& b = buffer_pool[i];
+ ggml_cann_buffer & b = buffer_pool[i];
ggml_cann_set_device(device);
ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
pool_size += size;
*actual_size = size;
- b.size = size;
- b.used = true;
+ b.size = size;
+ b.used = true;
if (i >= MAX_BUFFERS - 8) {
GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
}
"cann pool[%d]: allocate %p, "
"pool_size = %5u MB, "
"size = %5u MB\n",
- device, b.ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
- (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
+ (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
#endif
return b.ptr;
}
* @param ptr Pointer to the buffer to free.
* @param size Size of the buffer to free.
*/
- void free(void* ptr, size_t size) override {
+ void free(void * ptr, size_t size) override {
GGML_UNUSED(size);
for (int i = 0; i < MAX_BUFFERS; ++i) {
- ggml_cann_buffer& b = buffer_pool[i];
+ ggml_cann_buffer & b = buffer_pool[i];
if (b.ptr != ptr) {
continue;
}
- b.used = false;
+ b.used = false;
b.last_used = std::chrono::steady_clock::now();
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"cann pool[%d]: return %p, "
"pool_size = %5u MB\n",
- device, b.ptr,
- (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
+ device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
#endif
return;
}
/**
* @brief Pointer to the start of the virtual memory pool.
*/
- void* pool_addr = 0;
+ void * pool_addr = 0;
/**
* @brief Amount of virtual memory used in the pool.
/**
* @brief Offsets for the mapped memory regions.
*/
- std::vector<void*> map_offsets;
+ std::vector<void *> map_offsets;
/**
* @brief Constructor to initialize the buffer pool with virtual memory for
*
* @param device The device ID to associate with this buffer pool.
*/
- explicit ggml_cann_pool_vmm(int device)
- : device(device) {
- auto dev = ggml_cann_info().devices[device];
+ explicit ggml_cann_pool_vmm(int device) : device(device) {
+ auto dev = ggml_cann_info().devices[device];
granularity = dev.vmm_granularity;
- max_size = dev.total_vram;
+ max_size = dev.total_vram;
}
/**
*/
~ggml_cann_pool_vmm() {
if (pool_addr != 0) {
- for (auto& offset : map_offsets) {
+ for (auto & offset : map_offsets) {
ACL_CHECK(aclrtUnmapMem(offset));
}
- for (auto& handle : handles) {
+ for (auto & handle : handles) {
ACL_CHECK(aclrtFreePhysical(handle));
}
ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
* the allocated buffer.
* @return A pointer to the allocated buffer.
*/
- void* alloc(size_t size, size_t* actual_size) override {
+ void * alloc(size_t size, size_t * actual_size) override {
// round up the allocation size to the alignment to ensure that all
// allocations are aligned for all data types
const size_t alignment = 128;
- size = GGML_PAD(size, alignment);
+ size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
if (size > avail) {
// round up to the next multiple of the granularity
size_t reserve_size = size - avail;
- reserve_size = GGML_PAD(reserve_size, granularity);
+ reserve_size = GGML_PAD(reserve_size, granularity);
GGML_ASSERT(pool_size + reserve_size <= max_size);
// allocate more physical memory
aclrtPhysicalMemProp prop = {};
- prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
- prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
- prop.memAttr = ACL_HBM_MEM_HUGE;
- prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
- prop.location.id = device;
- prop.reserve = 0;
+ prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
+ prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
+ prop.memAttr = ACL_HBM_MEM_HUGE;
+ prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
+ prop.location.id = device;
+ prop.reserve = 0;
aclrtDrvMemHandle handle;
ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
// reserve virtual address space (if not already reserved)
if (pool_addr == 0) {
- ACL_CHECK(aclrtReserveMemAddress(
- &pool_addr, max_size, 0, NULL, 1));
+ ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
}
// map at the end of the pool
- ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
- handle, 0));
+ ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
handles.push_back(handle);
- map_offsets.push_back((char*)pool_addr + pool_size);
+ map_offsets.push_back((char *) pool_addr + pool_size);
// add to the pool
pool_size += reserve_size;
#ifdef DEBUG_CANN_MALLOC
- GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
- device, (unsigned long long) (pool_size/1024/1024),
- (unsigned long long) (reserve_size/1024/1024));
+ GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
+ (unsigned long long) (pool_size / 1024 / 1024),
+ (unsigned long long) (reserve_size / 1024 / 1024));
#endif
}
GGML_ASSERT(pool_addr != 0);
- void* ptr = (void*)((char*)pool_addr + pool_used);
+ void * ptr = (void *) ((char *) pool_addr + pool_used);
*actual_size = size;
pool_used += size;
#ifdef DEBUG_CANN_MALLOC
- GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
- (unsigned long long)size, (unsigned long long)ptr);
+ GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
+ (unsigned long long) ptr);
#endif
return ptr;
}
* @param ptr Pointer to the buffer to free.
* @param size Size of the buffer to free.
*/
- void free(void* ptr, size_t size) override {
+ void free(void * ptr, size_t size) override {
#ifdef DEBUG_CANN_MALLOC
- GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
- (unsigned long long)size, (unsigned long long)ptr);
+ GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
+ (unsigned long long) ptr);
#endif
pool_used -= size;
// all deallocations must be in reverse order of the allocations
- GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
+ GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
}
};
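// Note on ggml_cann_pool_vmm above: it behaves as a bump allocator over a single
// reserved virtual address range. Physical memory is mapped in granularity-sized
// chunks only when the high-water mark grows, and free() merely rewinds pool_used,
// so buffers must be released in reverse order of allocation (the assert in free()
// enforces this LIFO discipline).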
* @param device The device ID for which to create the pool.
* @return A unique pointer to the created CANN pool.
*/
-std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
- int device) {
+std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
if (mem_pool_type == "prio") {
* ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
*/
struct ggml_backend_cann_buffer_context {
- int32_t device; ///< The device ID associated with this buffer context.
- void* dev_ptr =
- nullptr; ///< Pointer to the device memory allocated for the buffer.
+ int32_t device; ///< The device ID associated with this buffer context.
+ void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
/**
* @brief Constructor to initialize the CANN buffer context.
* @param device The device ID associated with this buffer context.
* @param dev_ptr Pointer to the device memory allocated for the buffer.
*/
- ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
- : device(device),
- dev_ptr(dev_ptr) {}
+ ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
/**
* @brief Destructor to free the device memory allocated for the buffer.
* @return true if the buffer is a CANN buffer, false otherwise.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
-static bool ggml_backend_buffer_is_cann(
- ggml_backend_buffer_t buffer) {
+
+static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_cann(buffer->buft);
}
*
* @param buffer The CANN buffer to free.
*/
-static void ggml_backend_cann_buffer_free_buffer(
- ggml_backend_buffer_t buffer) {
- ggml_backend_cann_buffer_context* ctx =
- (ggml_backend_cann_buffer_context*)buffer->context;
+static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
delete ctx;
}
* @param buffer The CANN buffer whose base pointer is to be retrieved.
* @return A pointer to the base of the device memory allocated for the buffer.
*/
-static void* ggml_backend_cann_buffer_get_base(
- ggml_backend_buffer_t buffer) {
- ggml_backend_cann_buffer_context* ctx =
- (ggml_backend_cann_buffer_context*)buffer->context;
+static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
return ctx->dev_ptr;
}
* @param dst Pointer to the destination buffer where transformed data will be
* stored.
*/
-static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
- const void* src,
- void* dst) {
-
- int64_t n_elems = ggml_nelements(tensor);
- int64_t groups = n_elems / QK4_0;
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK4_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
- uint8_t* quant_offset = (uint8_t*)dst;
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+ uint8_t * quant_offset = (uint8_t *) dst;
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
for (int i = 0; i < groups; i++) {
- const block_q4_0* group =
- (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0));
- *scale_offset = group->d;
+ const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
+ *scale_offset = group->d;
scale_offset++;
// 0-15
}
// put (uint4b_t -8) into int4b_t
- for (quant_offset = (uint8_t*)dst;
- quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
+ for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
(*quant_offset) ^= 0x88;
}
}
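// Device-side layout produced by the transform above (inferred from this code,
// not from a CANN format specification):
//
//   [ n_elems / 2 bytes of packed 4-bit quants | n_elems / QK4_0 fp16 scales ]
//
// ggml's Q4_0 stores unsigned nibbles in 0..15 with an implicit -8 bias, so
// XOR-ing every packed byte with 0x88 rewrites each nibble v as the
// two's-complement int4 bit pattern of (v - 8), the signed form the CANN kernels
// expect. The transform_back function below applies the same XOR to undo it.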
* @param dst Pointer to the destination buffer where the Q4.0 formatted data
* will be stored.
*/
-static void ggml_backend_cann_transform_back_q4_0(
- const ggml_tensor* tensor, void* src, void* dst) {
+static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK4_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
- int64_t n_elems = ggml_nelements(tensor);
- int64_t groups = n_elems / QK4_0;
- size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
+ uint8_t * quant_offset = (uint8_t *) src;
+ uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
- uint8_t* quant_offset = (uint8_t*)src;
- uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
-
- for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
+ for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
(*quant_offset) ^= 0x88;
}
- quant_offset = (uint8_t*)src;
+ quant_offset = (uint8_t *) src;
for (int i = 0; i < groups; i++) {
- block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
- group->d = *scale_offset;
+ block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
+ group->d = *scale_offset;
scale_offset++;
// 0-15
for (int j = 0; j < QK4_0 / 2; j += 2) {
- group->qs[j] = ((*quant_offset) & 0x0F);
+ group->qs[j] = ((*quant_offset) & 0x0F);
group->qs[j + 1] = ((*quant_offset) >> 4);
quant_offset++;
}
* @param dst Pointer to the destination buffer where transformed data will be
* stored.
*/
-static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
- const void* src,
- void* dst) {
- int64_t n_elems = ggml_nelements(tensor);
- int64_t groups = n_elems / QK8_0;
- size_t quant_bytes = n_elems * sizeof(uint8_t);
+static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK8_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
- uint8_t* quant_offset = (uint8_t*)dst;
- uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
+ uint8_t * quant_offset = (uint8_t *) dst;
+ uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
for (int i = 0; i < groups; i++) {
- const block_q8_0* group =
- (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0));
- *scale_offset = group->d;
+ const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
+ *scale_offset = group->d;
scale_offset++;
size_t group_quant_size = QK8_0 * sizeof(uint8_t);
memcpy(quant_offset, group->qs, group_quant_size);
* @param dst Pointer to the destination buffer where the Q8.0 formatted data
* will be stored.
*/
-static void ggml_backend_cann_transform_back_q8_0(
- const ggml_tensor* tensor, const void* src, void* dst) {
- int64_t n_elems = ggml_nelements(tensor);
- int64_t groups = n_elems / QK8_0;
- size_t quant_bytes = n_elems * sizeof(uint8_t);
+static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
+ int64_t n_elems = ggml_nelements(tensor);
+ int64_t groups = n_elems / QK8_0;
+ size_t quant_bytes = n_elems * sizeof(uint8_t);
- const uint8_t* quant_offset = (const uint8_t*)src;
- const uint16_t* scale_offset =
- (const uint16_t*)((const char*)src + quant_bytes);
+ const uint8_t * quant_offset = (const uint8_t *) src;
+ const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
for (int i = 0; i < groups; i++) {
- block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
- group->d = *scale_offset;
+ block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
+ group->d = *scale_offset;
scale_offset++;
size_t group_quant_size = QK8_0 * sizeof(uint8_t);
memcpy(group->qs, quant_offset, group_quant_size);
* @param dst Pointer to the destination buffer where transformed data will be
* stored.
*/
-static void ggml_backend_cann_transform(ggml_tensor* tensor,
- const void* src, void* dst) {
+static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
switch (tensor->type) {
case GGML_TYPE_Q4_0:
ggml_backend_cann_transform_q4_0(tensor, src, dst);
* @param dst Pointer to the destination buffer where transformed tensor data
* will be stored.
*/
-static void ggml_backend_cann_transform_back(
- const ggml_tensor* tensor, void* src, void* dst) {
+static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
switch (tensor->type) {
case GGML_TYPE_Q4_0:
ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
* @param buffer The CANN buffer from which to initialize the tensor.
* @param tensor Pointer to the tensor to be initialized.
*/
-static enum ggml_status ggml_backend_cann_buffer_init_tensor(
- ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
+static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
if (tensor->view_src != NULL && tensor->view_offs == 0) {
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
if (ggml_is_quantized(tensor->type)) {
// Initialize padding to 0 to avoid possible NaN values
size_t original_size = ggml_nbytes(tensor);
- size_t padded_size =
- ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+ size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size && tensor->view_src == nullptr) {
size_t memset_size = padded_size - original_size;
- ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
- memset_size, 0, memset_size));
+ ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
}
}
return GGML_STATUS_SUCCESS;
* designed to be used with a global array, one per device.
*/
struct ggml_cann_nz_workspace {
- void* ptr; // Pointer to allocated device buffer
- size_t allocated; // Size of currently allocated buffer in bytes
+ void * ptr; // Pointer to allocated device buffer
+ size_t allocated; // Size of currently allocated buffer in bytes
/**
* @brief Constructor. Initializes the workspace with no allocated memory.
void clear() {
if (ptr) {
ACL_CHECK(aclrtFree(ptr));
- ptr = nullptr;
+ ptr = nullptr;
allocated = 0;
}
}
*
* @return Pointer to the allocated buffer, or nullptr if not allocated.
*/
- void* get() const { return ptr; }
+ void * get() const { return ptr; }
};
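// Typical lifecycle of the workspace above, as used later in this file
// (g_nz_workspaces is the per-device global array; sketch only):
//
//   g_nz_workspaces[device].realloc(workspace_size);  // reuses or grows the buffer
//   void * ws = g_nz_workspaces[device].get();        // may be nullptr when nothing is allocated
//   // ... run aclnnTransMatmulWeight with ws ...
//   g_nz_workspaces[device].clear();                  // released once per graph compute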
/**
* @note The workspace buffer used in this function is managed globally and reused
* across calls. This reduces overhead from repeated memory allocation and deallocation.
*/
-static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
- aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
- tensor->nb, 2, ACL_FORMAT_ND, offset);
- uint64_t workspaceSize = 0;
- aclOpExecutor *executor;
+static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
+ aclTensor * weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
+ uint64_t workspaceSize = 0;
+ aclOpExecutor * executor;
// TransMatmulWeight
- ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
- &workspaceSize, &executor));
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
// Avoid frequent malloc/free of the workspace.
g_nz_workspaces[device].realloc(workspaceSize);
- void* g_nz_workspace = g_nz_workspaces[device].get();
+ void * g_nz_workspace = g_nz_workspaces[device].get();
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
ACL_CHECK(aclDestroyTensor(weightTransposed));
* @param offset Offset in the source data from where to start copying.
* @param size Size of the data to be copied, in bytes.
*/
-static void ggml_backend_cann_buffer_set_tensor(
- ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
- size_t offset, size_t size) {
- ggml_backend_cann_buffer_context *ctx =
- (ggml_backend_cann_buffer_context *)buffer->context;
+static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
+ ggml_tensor * tensor,
+ const void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
ggml_cann_set_device(ctx->device);
// TODO: refer to cann(#6017), it use thread's default stream.
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (!need_transform(tensor->type)) {
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
- ACL_MEMCPY_HOST_TO_DEVICE));
- if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
+ if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
weight_format_to_nz(tensor, offset, ctx->device);
}
} else {
- void *transform_buffer = malloc(size);
+ void * transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
- ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
- transform_buffer, size,
- ACL_MEMCPY_HOST_TO_DEVICE));
+ ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
free(transform_buffer);
}
}
* @param offset Offset in the destination buffer where to start copying.
* @param size Size of the data to be copied, in bytes.
*/
-static void ggml_backend_cann_buffer_get_tensor(
- ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
- size_t offset, size_t size) {
- ggml_backend_cann_buffer_context* ctx =
- (ggml_backend_cann_buffer_context*)buffer->context;
+static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
+ const ggml_tensor * tensor,
+ void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
ggml_cann_set_device(ctx->device);
if (!need_transform(tensor->type)) {
- ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
- ACL_MEMCPY_DEVICE_TO_HOST));
+ ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
} else {
- void* transform_buffer = malloc(size);
- ACL_CHECK(aclrtMemcpy(transform_buffer, size,
- (char*)tensor->data + offset, size,
- ACL_MEMCPY_DEVICE_TO_HOST));
+ void * transform_buffer = malloc(size);
+ ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
free(transform_buffer);
}
* @param dst Pointer to the destination tensor where the data will be copied.
* @return true if the copy operation succeeded, false otherwise.
*/
-static bool ggml_backend_cann_buffer_cpy_tensor(
- ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
+static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
+ const ggml_tensor * src,
+ ggml_tensor * dst) {
if (ggml_backend_buffer_is_cann(src->buffer)) {
- ggml_backend_cann_buffer_context* src_ctx =
- (ggml_backend_cann_buffer_context*)src->buffer->context;
- ggml_backend_cann_buffer_context* dst_ctx =
- (ggml_backend_cann_buffer_context*)buffer->context;
+ ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
+ ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
size_t memcpy_size = ggml_nbytes(src);
// Same device.
if (src_ctx->device == dst_ctx->device) {
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
- (const char*)src->data, memcpy_size,
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE));
return true;
} else {
#endif
// Different device but can access by peer.
int32_t canAccessPeer = 0;
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
- dst_ctx->device));
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
if (canAccessPeer) {
ggml_cann_set_device(src_ctx->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
- ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
- (const char*)src->data, memcpy_size,
+ ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE));
return true;
}
* @param buffer The CANN buffer to be cleared.
* @param value The value to which each byte in the buffer will be set.
*/
-static void ggml_backend_cann_buffer_clear(
- ggml_backend_buffer_t buffer, uint8_t value) {
- ggml_backend_cann_buffer_context* ctx =
- (ggml_backend_cann_buffer_context*)buffer->context;
+static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
ggml_cann_set_device(ctx->device);
ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
* buffer type.
*/
struct ggml_backend_cann_buffer_type_context {
- int32_t
- device; /**< Device identifier associated with the buffer context. */
- std::string name; /**< Name associated with the buffer context. */
+ int32_t device; /**< Device identifier associated with the buffer context. */
+ std::string name; /**< Name associated with the buffer context. */
};
/**
* @param buft Pointer to the buffer type context.
* @return Const pointer to the C-style string containing the name.
*/
-static const char* ggml_backend_cann_buffer_type_name(
- ggml_backend_buffer_type_t buft) {
- ggml_backend_cann_buffer_type_context* buft_ctx =
- (ggml_backend_cann_buffer_type_context*)buft->context;
+static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->name.c_str();
}
* @param size Size in bytes of the buffer to allocate.
* @return Pointer to the allocated buffer, or nullptr if allocation fails.
*/
-static ggml_backend_buffer_t
-ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
- size_t size) {
- ggml_backend_cann_buffer_type_context* buft_ctx =
- (ggml_backend_cann_buffer_type_context*)buft->context;
+static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
ggml_cann_set_device(buft_ctx->device);
const size_t alignment = 128;
- size = GGML_PAD(size, alignment);
+ size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
- void* dev_ptr;
+ void * dev_ptr;
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
if (err != ACL_SUCCESS) {
- GGML_LOG_ERROR(
- "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
- __func__, size / 1024.0 / 1024.0, buft_ctx->device,
- aclGetRecentErrMsg());
+ GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
+ size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
return nullptr;
}
- ggml_backend_cann_buffer_context* ctx =
- new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
+ ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
- return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
- ctx, size);
+ return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
}
/**
* @return The alignment requirement in bytes (fixed at 128 bytes for CANN
* buffers).
*/
-static size_t ggml_backend_cann_buffer_type_get_alignment(
- ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 128;
GGML_UNUSED(buft);
* @return The total allocation size in bytes required for the tensor in the
* CANN buffer.
*/
-static size_t ggml_backend_cann_buffer_type_get_alloc_size(
- ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
- size_t size = ggml_nbytes(tensor);
- int64_t ne0 = tensor->ne[0];
+static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
+ const ggml_tensor * tensor) {
+ size_t size = ggml_nbytes(tensor);
+ int64_t ne0 = tensor->ne[0];
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
// size += (line_size_align_32 - line_size);
if (ggml_is_quantized(tensor->type)) {
if (ne0 % MATRIX_ROW_PADDING != 0) {
- size += ggml_row_size(
- tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
- } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+ } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
// NZ format weight are not support quantized yet.
// If ND tensor transform to NZ, size may changed.
- int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+ int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
- const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
- size_t new_size;
- ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
- ggml_cann_type_mapping(tensor->type), &new_size));
+ const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
+ size_t new_size;
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
ACL_CHECK(aclDestroyIntArray(acl_shape));
size = std::max(size, new_size);
}
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
-ggml_backend_buffer_type_t
-ggml_backend_cann_buffer_type(int32_t device) {
- static std::mutex mutex;
+ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
+ static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (device >= ggml_backend_cann_get_device_count()) {
return nullptr;
}
- static ggml_backend_buffer_type
- ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
+ static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
static bool ggml_backend_cann_buffer_type_initialized = false;
/* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
/* .context = */
- new ggml_backend_cann_buffer_type_context{
- i, "CANN" + std::to_string(i)},
+ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
};
}
ggml_backend_cann_buffer_type_initialized = true;
}
const size_t alignment = 128;
- size = GGML_PAD(size, alignment);
+ size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
- void * hostPtr = nullptr;
- aclError err = aclrtMallocHost((void **) &hostPtr, size);
+ void * hostPtr = nullptr;
+ aclError err = aclrtMallocHost((void **) &hostPtr, size);
if (err != ACL_SUCCESS) {
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
- size / 1024.0 / 1024.0, aclGetRecentErrMsg());
+ GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
+ aclGetRecentErrMsg());
return nullptr;
}
return hostPtr;
* @param size Size in bytes of the host buffer to allocate.
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
*/
-static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+ size_t size) {
void * hostPtr = ggml_cann_host_malloc(size);
if (hostPtr == nullptr) {
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
- buffer->buft = buft;
- buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
+ buffer->buft = buft;
+ buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
return buffer;
}
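// Behavioural note for the host buffer allocation above: when aclrtMallocHost
// fails, ggml_cann_host_malloc() logs a warning and returns nullptr, and the
// caller falls back to an ordinary CPU buffer (as documented in the doxygen
// comment above), so a failed pinned allocation only costs transfer bandwidth,
// never correctness.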
ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
/* .iface = */ {
- /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
- /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
+ /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
- },
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
+ /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+ },
+ /* .device = */
+ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
/* .context = */ nullptr,
};
* stored.
* @return true if the computation was successful; false otherwise.
*/
-static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
- struct ggml_tensor* dst) {
+static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
switch (dst->op) {
case GGML_OP_REPEAT:
ggml_cann_repeat(ctx, dst);
case GGML_UNARY_OP_SILU:
GGML_CANN_CALL_OP_UNARY(Silu);
break;
- case GGML_UNARY_OP_GELU_QUICK: {
- auto lambda = [](ggml_backend_cann_context& ctx,
- aclTensor* acl_src,
- aclTensor* acl_dst) {
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
- };
- ggml_cann_op_unary(lambda, ctx, dst);
- } break;
+ case GGML_UNARY_OP_GELU_QUICK:
+ {
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
+ };
+ ggml_cann_op_unary(lambda, ctx, dst);
+ }
+ break;
case GGML_UNARY_OP_TANH:
GGML_CANN_CALL_OP_UNARY(Tanh);
break;
case GGML_GLU_OP_SWIGLU:
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
break;
- case GGML_GLU_OP_GEGLU_QUICK: {
- auto lambda = [](ggml_backend_cann_context& ctx,
- aclTensor* acl_src,
- aclTensor* acl_dst) {
- GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
- };
- ggml_cann_op_unary_gated(lambda, ctx, dst);
- } break;
+ case GGML_GLU_OP_GEGLU_QUICK:
+ {
+ auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
+ GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
+ };
+ ggml_cann_op_unary_gated(lambda, ctx, dst);
+ }
+ break;
default:
return false;
}
* @param backend Pointer to the CANN backend structure.
* @return A pointer to a constant string representing the backend name.
*/
-static const char* ggml_backend_cann_name(ggml_backend_t backend) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+static const char * ggml_backend_cann_name(ggml_backend_t backend) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
return cann_ctx->name.c_str();
}
* @param backend Pointer to the CANN backend structure to be freed.
*/
static void ggml_backend_cann_free(ggml_backend_t backend) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
ACL_CHECK(aclrtSynchronizeDevice());
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
delete backend;
}
-
/**
* @brief Sets tensor data asynchronously in the CANN backend.
*
* @param size Size of the data to copy in bytes.
*/
static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
- ggml_tensor *tensor,
- const void *data,
- size_t offset,
- size_t size) {
- ggml_backend_cann_context *cann_ctx =
- (ggml_backend_cann_context *)backend->context;
- ggml_backend_buffer_t buf =
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
- "unsupported buffer type");
+ ggml_tensor * tensor,
+ const void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
GGML_ASSERT(!ggml_is_quantized(tensor->type));
- ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size,
- ACL_MEMCPY_HOST_TO_DEVICE);
+ ggml_cann_async_memcpy(cann_ctx, (char *) tensor->data + offset, data, size, ACL_MEMCPY_HOST_TO_DEVICE);
}
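// These asynchronous hooks back ggml_backend_tensor_set_async() (and, below,
// ggml_backend_tensor_get_async()): the copies are issued through the backend
// context asynchronously, and, as the asserts require, they only accept
// non-quantized tensors living in this device's ggml_backend_cann_buffer_type
// buffer.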
/**
* @param offset Offset in bytes within the host data.
* @param size Size of the data to copy in bytes.
*/
-static void ggml_backend_cann_get_tensor_async(
- ggml_backend_t backend, const ggml_tensor *tensor, void *data,
- size_t offset, size_t size) {
- ggml_backend_cann_context *cann_ctx =
- (ggml_backend_cann_context *)backend->context;
- ggml_backend_buffer_t buf =
- tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
+ const ggml_tensor * tensor,
+ void * data,
+ size_t offset,
+ size_t size) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
- GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
- "unsupported buffer type");
+ GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
GGML_ASSERT(!ggml_is_quantized(tensor->type));
- ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size,
- ACL_MEMCPY_DEVICE_TO_HOST);
-
+ ggml_cann_async_memcpy(cann_ctx, data, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST);
}
/**
* @param dst Pointer to the destination tensor to copy data to.
* @return true if the copy operation succeeds, false otherwise.
*/
-static bool ggml_backend_cann_cpy_tensor_async(
- ggml_backend_t backend_src, ggml_backend_t backend_dst,
- const ggml_tensor* src, ggml_tensor* dst) {
- GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
- ggml_backend_is_cann(backend_dst));
+static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
+ ggml_backend_t backend_dst,
+ const ggml_tensor * src,
+ ggml_tensor * dst) {
+ GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
- GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
+ GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
- if (!ggml_backend_buffer_is_cann(src->buffer) ||
- !ggml_backend_buffer_is_cann(dst->buffer)) {
+ if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
return false;
}
- ggml_backend_buffer_t buf_src =
- src->view_src ? src->view_src->buffer : src->buffer;
- ggml_backend_buffer_t buf_dst =
- dst->view_src ? dst->view_src->buffer : dst->buffer;
+ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
+ ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
- ggml_backend_cann_context* cann_ctx_src =
- (ggml_backend_cann_context*)backend_src->context;
- ggml_backend_cann_context* cann_ctx_dst =
- (ggml_backend_cann_context*)backend_dst->context;
+ ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
+ ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
size_t copy_size = ggml_nbytes(dst);
if (copy_size == 0) {
// TODO: Support 310p P2P copy
return false;
#endif
- ggml_backend_cann_buffer_context* buf_ctx_src =
- (ggml_backend_cann_buffer_context*)buf_src->context;
- ggml_backend_cann_buffer_context* buf_ctx_dst =
- (ggml_backend_cann_buffer_context*)buf_dst->context;
+ ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
+ ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
int32_t canAccessPeer = 0;
- ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
- cann_ctx_dst->device));
+ ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
if (!canAccessPeer) {
return false;
}
// wait for task_queue empty to keep task order.
cann_ctx_src->task_queue.wait();
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
- ACL_MEMCPY_DEVICE_TO_DEVICE,
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_src->stream()));
// record event on src stream after the copy
// TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
} else {
// src and dst are on the same backend
- ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
- ACL_MEMCPY_DEVICE_TO_DEVICE,
+ ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_dst->stream()));
}
* @param backend Pointer to the CANN backend structure to synchronize.
*/
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
cann_ctx->task_queue.wait();
ggml_cann_set_device(cann_ctx->device);
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph.
*/
-static void add_lru_matched_graph_node_properties(
- ggml_backend_cann_context * cann_ctx,
- ggml_cgraph * cgraph) {
+static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
ggml_cann_graph * new_graph = new ggml_cann_graph();
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
ggml_tensor * node = cgraph->nodes[node_idx];
- auto & prop = new_graph->ggml_graph_properties[node_idx];
+ auto & prop = new_graph->ggml_graph_properties[node_idx];
prop.node_address = node->data;
prop.node_op = node->op;
* @param graph_node_properties The stored properties of a CANN graph node.
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
*/
-static bool ggml_graph_node_has_matching_properties(
- ggml_tensor * node,
- ggml_graph_node_properties * graph_node_properties) {
- if (node->data != graph_node_properties->node_address &&
- node->op != GGML_OP_VIEW) {
+static bool ggml_graph_node_has_matching_properties(ggml_tensor * node,
+ ggml_graph_node_properties * graph_node_properties) {
+ if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) {
return false;
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i]) {
- if (node->src[i]->data != graph_node_properties->src_address[i] &&
- node->op != GGML_OP_VIEW) {
+ if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) {
return false;
}
* @return true if a matching cached graph exists; false otherwise.
*/
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
- ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
- for (auto &graph_ptr : lru_cache.cache_list) {
+ ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache;
+ for (auto & graph_ptr : lru_cache.cache_list) {
// Skip graphs with a different number of nodes.
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
continue;
* @param use_cann_graph Whether to use CANN graph execution.
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
*/
-static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
- bool & use_cann_graph, bool & cann_graph_update_required) {
+static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
+ ggml_cgraph * cgraph,
+ bool & use_cann_graph,
+ bool & cann_graph_update_required) {
#ifdef USE_ACL_GRAPH
- ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
+ ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
if (use_cann_graph && cann_graph_update_required) {
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
}
-#endif // USE_ACL_GRAPH
+#endif // USE_ACL_GRAPH
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
// With the use of CANN graphs, the execution will be performed by the graph launch.
if (!use_cann_graph || cann_graph_update_required) {
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
+ node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
}
#ifdef USE_ACL_GRAPH
- if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
+ if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
}
// Execute graph
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
}
-#endif // USE_ACL_GRAPH
+#endif // USE_ACL_GRAPH
}
-
/**
* @brief Computes a computational graph using a CANN backend.
*
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
* completes successfully, otherwise an appropriate error status.
*/
-static enum ggml_status ggml_backend_cann_graph_compute(
- ggml_backend_t backend, ggml_cgraph* cgraph) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
ggml_cann_set_device(cann_ctx->device);
g_nz_workspaces[cann_ctx->device].clear();
cann_ctx->rope_cache.cached = false;
#ifdef USE_ACL_GRAPH
- bool use_cann_graph = true;
+ bool use_cann_graph = true;
bool cann_graph_update_required = false;
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
}
}
#else
- bool use_cann_graph = false;
+ bool use_cann_graph = false;
bool cann_graph_update_required = false;
#endif // USE_ACL_GRAPH
- evaluate_and_capture_cann_graph(
- cann_ctx,
- cgraph,
- use_cann_graph,
- cann_graph_update_required
- );
+ evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
return GGML_STATUS_SUCCESS;
}
* @return bool Returns true if the operation is supported by the backend,
* otherwise false.
*/
-static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
- const ggml_tensor* op) {
+static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
return false;
}
break;
- case GGML_OP_MUL_MAT: {
- switch (op->src[0]->type) {
- case GGML_TYPE_F16:
- case GGML_TYPE_F32:
- return true;
- case GGML_TYPE_Q8_0:
- case GGML_TYPE_Q4_0:
+ case GGML_OP_MUL_MAT:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F16:
+ case GGML_TYPE_F32:
+ return true;
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q4_0:
#ifdef ASCEND_310P
- // Q4 && Q8 per group is not support on 310p device
- return false;
+ // Q4 && Q8 per group is not supported on the 310p device
+ return false;
#endif
- // only support contiguous for quantized types.
- return ggml_is_contiguous(op->src[0]) &&
- ggml_is_contiguous(op->src[1]);
- default:
- return false;
+ // only support contiguous for quantized types.
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+ default:
+ return false;
+ }
}
- }
case GGML_OP_MUL_MAT_ID:
switch (op->src[0]->type) {
case GGML_TYPE_F16:
return false;
#endif
// only support contiguous for quantized types.
- return ggml_is_contiguous(op->src[0]) &&
- ggml_is_contiguous(op->src[1]);
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
default:
return false;
}
// embedding
- case GGML_OP_GET_ROWS: {
- switch (op->src[0]->type) {
- case GGML_TYPE_F32:
- case GGML_TYPE_F16:
- case GGML_TYPE_Q8_0:
- return true;
- default:
- return false;
- }
- } break;
- case GGML_OP_SET_ROWS: {
- switch (op->type) {
- case GGML_TYPE_F32:
- case GGML_TYPE_F16:
- return true;
- default:
- return false;
+ case GGML_OP_GET_ROWS:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q8_0:
+ return true;
+ default:
+ return false;
+ }
}
- } break;
- case GGML_OP_CPY: {
- ggml_tensor *src = op->src[0];
- if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
- (src->type != GGML_TYPE_F32 &&
- src->type != GGML_TYPE_F16)) {
- // only support F32 and F16.
- return false;
+ break;
+ case GGML_OP_SET_ROWS:
+ {
+ switch (op->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ return true;
+ default:
+ return false;
+ }
}
- return true;
- } break;
- case GGML_OP_CONT: {
- // TODO: support GGML_TYPE_BF16
- switch (op->src[0]->type) {
- case GGML_TYPE_F32:
- case GGML_TYPE_F16:
- return true;
- default:
+ break;
+ case GGML_OP_CPY:
+ {
+ ggml_tensor * src = op->src[0];
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
+ // only F32 and F16 are supported.
return false;
+ }
+ return true;
}
- }
- case GGML_OP_ROPE: {
- // TODO: with ops-test v == 1
- // TODO: n_dims <= ne0
- if (op->src[0]->ne[0] != op->op_params[1]) {
- return false;
+ break;
+ case GGML_OP_CONT:
+ {
+ // TODO: support GGML_TYPE_BF16
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ return true;
+ default:
+ return false;
+ }
}
+ case GGML_OP_ROPE:
+ {
+ // TODO: with ops-test v == 1
+ // TODO: n_dims <= ne0
+ if (op->src[0]->ne[0] != op->op_params[1]) {
+ return false;
+ }
- const int mode = ((const int32_t *) op->op_params)[2];
- if (mode & GGML_ROPE_TYPE_MROPE) {
- return false;
- }
- if (mode & GGML_ROPE_TYPE_VISION) {
- return false;
- }
+ const int mode = ((const int32_t *) op->op_params)[2];
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
#ifdef ASCEND_310P
- if(!ggml_is_contiguous(op->src[0])){
- return false;
- }
+ if (!ggml_is_contiguous(op->src[0])) {
+ return false;
+ }
#endif
- return true;
- }
- case GGML_OP_UPSCALE: {
- // aclnnUpsampleNearest2dGetWorkspaceSize not support
- // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
- if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
- return false;
+ return true;
}
- if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
- return false;
+ case GGML_OP_UPSCALE:
+ {
+ // aclnnUpsampleNearest2dGetWorkspaceSize does not support the case where
+ // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] are not equal
+ if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
+ return false;
+ }
+ if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
+ return false;
+ }
+ return true;
}
- return true;
- }
- case GGML_OP_POOL_2D: {
- const int32_t * opts = (const int32_t *) op->op_params;
+ case GGML_OP_POOL_2D:
+ {
+ const int32_t * opts = (const int32_t *) op->op_params;
#ifdef ASCEND_310P
- enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
- if(opt == GGML_OP_POOL_MAX){
- return false;
- }
+ enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
+ if (opt == GGML_OP_POOL_MAX) {
+ return false;
+ }
#endif
- const int k0 = opts[1];
- const int k1 = opts[2];
- const int p0 = opts[5];
- const int p1 = opts[6];
- // value of paddingH should be at most half of kernelH
- // value of paddingW should be at most half of kernelW
- return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
- }
+ const int k0 = opts[1];
+ const int k1 = opts[2];
+ const int p0 = opts[5];
+ const int p1 = opts[6];
+ // value of paddingH should be at most half of kernelH
+ // value of paddingW should be at most half of kernelW
+ return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
+ }
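            // Worked example for the padding check above: a 3x3 pool with p0 = p1 = 1
            // satisfies p <= k / 2 (1 <= 3 / 2 = 1 in integer division) and is accepted,
            // while p0 = 2 exceeds it and the op is left for another backend to handle.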
case GGML_OP_DUP:
case GGML_OP_SUM:
case GGML_OP_IM2COL:
return (op->src[0]->ne[0] - 1) <= 255;
case GGML_OP_SCALE:
float bias;
- memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
- return bias == 0.0f; // TODO: support bias != 0.0f
+ memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
+ return bias == 0.0f; // TODO: support bias != 0.0f
case GGML_OP_SOFT_MAX:
// TODO: support attention sinks [TAG_ATTN_SINKS]
if (op->src[2]) {
return false;
}
return true;
- case GGML_OP_FLASH_ATTN_EXT:{
+ case GGML_OP_FLASH_ATTN_EXT:
+ {
#ifdef ASCEND_310P
- // FA not support on 310p device
- return false;
-#endif
- // derived from [ggml-cuda.cu]
- if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
- return false;
- }
- if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
- return false;
- }
- if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
- return false;
- }
- // TODO: support attention sinks [TAG_ATTN_SINKS]
- if (op->src[4]) {
- return false;
- }
- if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
- // different head sizes of K and V are not supported yet
- return false;
- }
- if (op->src[0]->ne[0] % 16 != 0) {
- // TODO: padding to support
- return false;
- }
- float logitSoftcap = 0.0f;
- memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
- if(logitSoftcap != 0.0f) {
+ // FA is not supported on the 310p device
return false;
+#endif
+ // derived from [ggml-cuda.cu]
+ if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
+ return false;
+ }
+ if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
+ op->src[1]->type != GGML_TYPE_BF16) {
+ return false;
+ }
+ if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
+ return false;
+ }
+ // TODO: support attention sinks [TAG_ATTN_SINKS]
+ if (op->src[4]) {
+ return false;
+ }
+ if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
+ // different head sizes of K and V are not supported yet
+ return false;
+ }
+ if (op->src[0]->ne[0] % 16 != 0) {
+ // TODO: padding to support
+ return false;
+ }
+ float logitSoftcap = 0.0f;
+ memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
+ if (logitSoftcap != 0.0f) {
+ return false;
+ }
+ return true;
}
- return true;
- }
default:
return false;
}
* @return bool Returns true if the operation should be offloaded, otherwise
* false.
*/
-static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
- const ggml_tensor* op) {
+static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32;
GGML_UNUSED(dev);
* @param event Pointer to the event structure to be recorded.
*/
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
- ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
+ ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
}
/**
* @param event Pointer to the event structure that the backend needs to wait
* for.
*/
-static void ggml_backend_cann_event_wait(ggml_backend_t backend,
- ggml_backend_event_t event) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
+static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
+ ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
if (ggml_backend_is_cann(backend)) {
- ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
- (aclrtEvent)event->context));
+ ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
} else {
GGML_ABORT("fatal error");
}
* @return A pointer to the static GUID.
*/
static ggml_guid_t ggml_backend_cann_guid() {
- static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
- 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64};
+ static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
+ 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
return &guid;
}
// backend device
struct ggml_backend_cann_device_context {
- int device;
+ int device;
std::string name;
std::string description;
};
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
return ctx->name.c_str();
}
-static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
return ctx->description.c_str();
}
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
ggml_backend_cann_get_device_memory(ctx->device, free, total);
}
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
return ggml_backend_cann_init(ctx->device);
}
* @return bool Returns true if the CANN backend supports the buffer type,
* otherwise false.
*/
-static bool ggml_backend_cann_supports_buft(
- ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
if (ggml_backend_buft_is_cann(buft)) {
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
- ggml_backend_cann_buffer_type_context * buft_ctx =
- (ggml_backend_cann_buffer_type_context *)buft->context;
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
+ ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
return buft_ctx->device == dev_ctx->device;
}
return false;
}
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
- ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
+ ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
return ggml_backend_cann_buffer_type(ctx->device);
}
* @param backend Pointer to the CANN backend.
* @return ggml_backend_event_t Returns a pointer to the new event structure.
*/
-static ggml_backend_event_t ggml_backend_cann_device_event_new(
- ggml_backend_dev_t dev) {
- ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
+static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
+ ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
ggml_cann_set_device(dev_ctx->device);
* @param event Pointer to the event structure to be freed.
*/
static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
- ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
+ ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
delete event;
GGML_UNUSED(dev);
* @param event Pointer to the event structure to be synchronized.
*/
static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
- ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
+ ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
GGML_UNUSED(dev);
}
/* .get_memory = */ ggml_backend_cann_device_get_memory,
/* .get_type = */ ggml_backend_cann_device_get_type,
/* .get_props = */ ggml_backend_cann_device_get_props,
- /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
+ /* .init_backend = */ ggml_backend_cann_device_init, // called for every card
/* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
- /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
+ /* .buffer_from_host_ptr = */ NULL, // not supported for CANN
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
};
-
// backend reg
struct ggml_backend_cann_reg_context {
std::vector<ggml_backend_dev_t> devices;
}
static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
return ctx->devices.size();
}
static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
- ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
+ ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
GGML_ASSERT(index < ctx->devices.size());
return ctx->devices[index];
}
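// Illustrative sketch (not part of the patch): these registry callbacks are normally
// reached through the generic helpers from ggml-backend.h (ggml_backend_reg_dev_count,
// ggml_backend_reg_dev_get, ggml_backend_dev_init), roughly as below; the helper
// name is hypothetical.
static ggml_backend_t cann_example_init_first_device(void) {
    ggml_backend_reg_t reg = ggml_backend_cann_reg();
    for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        GGML_LOG_INFO("%s: found %s (%s)\n", __func__, ggml_backend_dev_name(dev),
                      ggml_backend_dev_description(dev));
    }
    if (ggml_backend_reg_dev_count(reg) == 0) {
        return nullptr;
    }
    // equivalent to ggml_backend_cann_init(0), but going through the device interface defined above
    return ggml_backend_dev_init(ggml_backend_reg_dev_get(reg, 0), nullptr);
}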
// backend registry, called only once for cann backend
ggml_backend_reg_t ggml_backend_cann_reg() {
static ggml_backend_reg reg;
- static bool initialized = false;
+ static bool initialized = false;
{
- static std::mutex mutex;
+ static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
aclInit(nullptr);
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
for (int i = 0; i < ggml_cann_info().device_count; i++) {
- ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
- dev_ctx->description = aclrtGetSocName();
- dev_ctx->device = i;
- dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
+ ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
+ dev_ctx->description = aclrtGetSocName();
+ dev_ctx->device = i;
+ dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i);
- ggml_backend_dev_t dev = new ggml_backend_device {
- /* .iface = */ ggml_backend_cann_device_interface,
-                /* .reg = */ &reg,
- /* .context = */ dev_ctx
- };
+ ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
+                                                              /* .reg = */ &reg,
+ /* .context = */ dev_ctx };
ctx->devices.push_back(dev);
}
- reg = ggml_backend_reg {
- /* .api_version = */ GGML_BACKEND_API_VERSION,
- /* .iface = */ ggml_backend_cann_reg_interface,
- /* .context = */ ctx
- };
+ reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_cann_reg_interface,
+ /* .context = */ ctx };
}
initialized = true;
return nullptr;
}
- ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
+ ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
if (ctx == nullptr) {
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return nullptr;
}
ggml_cann_set_device(ctx->device);
ggml_backend_t cann_backend =
- new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
- /* .interface = */ ggml_backend_cann_interface,
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
- /* .context = */ ctx};
+ new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
+ /* .interface = */ ggml_backend_cann_interface,
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
+ /* .context = */ ctx };
return cann_backend;
}
bool ggml_backend_is_cann(ggml_backend_t backend) {
- return backend != NULL &&
- ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
}
int32_t ggml_backend_cann_get_device_count() {
return ggml_cann_info().device_count;
}
-void ggml_backend_cann_get_device_description(
- int32_t device, char* description, size_t description_size) {
+void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
ggml_cann_set_device(device);
- const char* soc_name = aclrtGetSocName();
+ const char * soc_name = aclrtGetSocName();
snprintf(description, description_size, "%s", soc_name);
}
-void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
- size_t* total) {
+void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
}
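// Illustrative usage sketch (not part of the patch) for the public entry points defined
// above; assumes <cstdio> for printf and ggml-backend.h for ggml_backend_free, which this
// file does not necessarily pull in on its own. The helper name is hypothetical.
static void cann_example_list_devices_and_init(void) {
    int32_t n_dev = ggml_backend_cann_get_device_count();
    for (int32_t i = 0; i < n_dev; i++) {
        char   desc[128];
        size_t free_mem  = 0;
        size_t total_mem = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &free_mem, &total_mem);
        printf("CANN%d: %s, %zu/%zu bytes free\n", i, desc, free_mem, total_mem);
    }
    if (n_dev > 0) {
        ggml_backend_t backend = ggml_backend_cann_init(0);
        if (backend != nullptr && ggml_backend_is_cann(backend)) {
            // ... build and run a graph on the backend ...
            ggml_backend_free(backend);
        }
    }
}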