* @note The workspace buffer used in this function is managed globally and reused
* across calls, which reduces the overhead of repeated memory allocation and deallocation.
*/
-static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
tensor->nb, 2, ACL_FORMAT_ND, offset);
uint64_t workspaceSize = 0;
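// A minimal sketch of the globally managed, reused workspace buffer described in the
// @note above, assuming a simple grow-only cache; requires <cstdlib> for malloc/free.
// The helper name get_nz_workspace and its internal statics are hypothetical, not the
// backend's actual code, and a real implementation would allocate device memory
// through the runtime allocator rather than host memory via malloc.
static void * get_nz_workspace(size_t size) {
    static void * buf      = nullptr; // cached buffer, reused across calls
    static size_t capacity = 0;       // current capacity in bytes
    if (size > capacity) {
        // grow only when the request exceeds the cached capacity, so repeated
        // calls with similar workspace sizes skip allocation entirely
        free(buf);
        buf      = malloc(size);
        capacity = (buf != nullptr) ? size : 0;
    }
    return buf;
}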
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
- weight_format_to_nz(tensor, data, offset);
+ weight_format_to_nz(tensor, offset);
}
} else {
void *transform_buffer = malloc(size);
return true;
case GGML_OP_SCALE:
float bias;
- memcpy(&bias, (float*)op->op_params + 1, sizeof(float));
+ memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
return bias == 0.0f; // TODO: support bias != 0.0f
case GGML_OP_SOFT_MAX:
// TODO: support attention sinks [TAG_ATTN_SINKS]
return false;
}
float logitSoftcap = 0.0f;
- memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
+ memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
if(logitSoftcap != 0.0f) {
return false;
}
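// Sketch of the op_params read pattern used above, assuming op_params is a small
// array of raw 32-bit slots holding packed float parameters; requires <cstring> and
// <cstdint>. Copying with memcpy, rather than dereferencing the casted pointer
// directly, sidesteps strict-aliasing concerns. The struct below is illustrative
// only, not the actual ggml definition.
struct example_op {
    int32_t op_params[16]; // packed per-op parameters
};

static float read_param_f32(const example_op * op, int idx) {
    float v = 0.0f;
    // copy the idx-th 32-bit slot out of op_params and reinterpret it as a float
    memcpy(&v, (const float *)(op->op_params) + idx, sizeof(float));
    return v;
}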