size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]};
- aclTensor* acl_weight_tensor =
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+ aclTensor* acl_weight_tensor;
+
+ bool weightToNZ = false;
+#ifdef ASCEND_310P
+ weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
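+ // On 310P with GGML_CANN_WEIGHT_NZ set, matmul weights were already rewritten to FRACTAL_NZ
+ // layout when their data was uploaded (see weight_format_to_nz), so build the weight view in
+ // that format here; otherwise fall back to a plain ND view.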
+ if (weightToNZ && is_matmul_weight(weight)) {
+ int64_t acl_stride[2] = {1, transpose_ne[1]};
+
+ // Reverse ne.
+ std::reverse(transpose_ne, transpose_ne + n_dims);
+
+ std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
+
+ acl_weight_tensor = aclCreateTensor(
+ transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
+ 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+ } else {
+ acl_weight_tensor =
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
+ }
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
#ifndef CANN_ACLNN_OPS
#define CANN_ACLNN_OPS
+#include <unordered_set>
#include <functional>
#include <aclnnop/aclnn_abs.h>
#include <aclnnop/aclnn_neg.h>
*/
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+/**
+ * @brief Check whether a tensor is a weight tensor for matrix multiplication.
+ *
+ * @details Checks whether the given tensor serves as a weight parameter in matrix multiplication
+ * operations, typically within neural network layers. The function keeps a static set of
+ * canonical weight-name suffixes used by Transformer-based architectures and matches them as
+ * substrings, so weight tensors are recognized even under hierarchical naming patterns
+ * (see the illustrative example after the function definition).
+ *
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
+ * @return true if the tensor name contains one of the known weight suffixes, false otherwise.
+ */
+static bool is_matmul_weight(const ggml_tensor* tensor) {
+ std::string name = ggml_get_name(tensor);
+ static const std::unordered_set<std::string> weight_suffixes{
+ "output.weight",
+ "attn_q.weight",
+ "attn_k.weight",
+ "attn_v.weight",
+ "attn_output.weight",
+ "ffn_gate.weight",
+ "ffn_up.weight",
+ "ffn_down.weight"
+ };
+
+ for (const auto& suffix : weight_suffixes) {
+ if (name.find(suffix) != std::string::npos) {
+ return true;
+ }
+ }
+ return false;
+}
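+
+// Illustrative example (assuming llama.cpp-style tensor names): "blk.0.attn_q.weight" contains
+// the "attn_q.weight" suffix and is treated as a matmul weight, while a name outside the suffix
+// set such as "blk.0.attn_norm.weight" is not.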
+
/**
 * @brief Applies an element-wise operation to two input tensors using the CANN
* backend.
#include <acl/acl.h>
#include <stdarg.h>
+#include <aclnnop/aclnn_trans_matmul_weight.h>
#include <cmath>
#include <cstdio>
return GGML_STATUS_SUCCESS;
}
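+
+/**
+ * @brief Copy host weight data to a device buffer sized for the NZ weight transform and wrap it
+ *        in an ND-format aclTensor.
+ *
+ * @details The required element count is obtained from aclnnCalculateMatmulWeightSizeV2 so the
+ *          buffer can also hold the transformed (possibly padded) layout produced later by
+ *          aclnnTransMatmulWeight.
+ */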
+static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+ aclDataType dataType, aclTensor **tensor)
+{
+ uint64_t size = 1;
+ for (auto i : shape) {
+ size *= i;
+ }
+
+ // Query the element count required for the transformed (NZ) weight layout.
+ const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
+ ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
+ ACL_CHECK(aclDestroyIntArray(mat2Size));
+
+ // Assumes 2-byte (FP16/BF16) weight elements.
+ size *= sizeof(int16_t);
+
+ ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
+ ACL_CHECK(aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE));
+
+ std::vector<int64_t> strides(shape.size(), 1);
+ for (int64_t i = shape.size() - 2; i >= 0; i--) {
+ strides[i] = shape[i + 1] * strides[i + 1];
+ }
+
+ *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
+ shape.data(), shape.size(), *deviceAddr);
+ return 0;
+}
+
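+/**
+ * @brief Convert a matmul weight tensor to FRACTAL_NZ format in place on the device.
+ *
+ * @details Uploads the host weight data to a temporary device buffer, runs aclnnTransMatmulWeight
+ *          to rearrange it into FRACTAL_NZ layout, then copies the result back into the tensor's
+ *          device buffer at the given offset.
+ *
+ * @param tensor The ggml weight tensor whose device data is rewritten in NZ layout.
+ * @param data   Host pointer to the original (ND) weight data.
+ * @param offset Byte offset into tensor->data where the weight is stored.
+ */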
+static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+ aclrtStream stream;
+ ACL_CHECK(aclrtCreateStream(&stream));
+
+ std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
+ void *weightTransposedDeviceAddr = nullptr;
+ aclTensor *weightTransposed = nullptr;
+ CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
+ ggml_cann_type_mapping(tensor->type), &weightTransposed);
+
+ uint64_t workspaceSize = 0;
+ aclOpExecutor *executor;
+ void *workspaceAddr = nullptr;
+
+ // TransMatmulWeight
+ ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
+ std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
+ if (workspaceSize > 0) {
+ ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+ workspaceAddrPtrTrans.reset(workspaceAddr);
+ }
+ ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+ // Wait for the in-place NZ transformation to finish before copying the result.
+ ACL_CHECK(aclrtSynchronizeStream(stream));
+
+ size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
+
+ // Source and destination both live in device memory, so this is a device-to-device copy.
+ ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
+ weightTransposedDeviceAddr, size, ACL_MEMCPY_DEVICE_TO_DEVICE));
+ ACL_CHECK(aclDestroyTensor(weightTransposed));
+ ACL_CHECK(aclrtFree(weightTransposedDeviceAddr));
+ ACL_CHECK(aclrtDestroyStream(stream));
+}
+
// TODO: need handle tensor which has paddings.
/**
* @brief Set tensor data in a CANN buffer.
// For acl, synchronous functions use this default stream.
// Why aclrtSynchronizeDevice?
+ bool weightToNZ = false;
+#ifdef ASCEND_310P
+ weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
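+ // If requested, rewrite matmul weights into FRACTAL_NZ layout right after upload so that
+ // matrix multiplication can consume them directly (310P only, enabled via GGML_CANN_WEIGHT_NZ).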
+ if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+ weight_format_to_nz(tensor, data, offset);
+ }
} else {
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);