ggml_tensor * src0 = dst->src[0]; // src
ggml_tensor * src1 = dst->src[1]; // index
- GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16
+ || dst->type == GGML_TYPE_BF16);
switch (src0->type) {
+ case GGML_TYPE_BF16:
case GGML_TYPE_F16:
case GGML_TYPE_F32:
if (src0->type == dst->type) {
// same element type: rows can be index-copied directly, no cast needed
break;
}
// type mismatch: cast src0 into a transit buffer of dst->type first
{
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type));
void * src_trans_buffer = src_buffer_allocator.get();
size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
}
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
- src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
+ src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
dst->type);
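// The cast above picks its ACL dtype via ggml_cann_type_mapping. A minimal
// sketch of what that mapping plausibly looks like with this patch applied
// (the exact case list in ggml-cann is an assumption, not taken from the diff):
static aclDataType ggml_cann_type_mapping_sketch(ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32:  return ACL_FLOAT;
        case GGML_TYPE_F16:  return ACL_FLOAT16;
        case GGML_TYPE_BF16: return ACL_BF16;  // the new case this patch exercises
        case GGML_TYPE_I32:  return ACL_INT32;
        default:             return ACL_DT_UNDEFINED;
    }
}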
// Only check env once.
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
- if (weight_to_nz && is_matmul_weight(weight)) {
+ if (weight_to_nz && weight->type != GGML_TYPE_BF16 && is_matmul_weight(weight)) {
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
} else {
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
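// GGML_CANN_WEIGHT_NZ defaults to "on", and BF16 weights are now routed to the
// ND path even when it is enabled. A rough sketch of the gate's semantics
// (parse_bool's accepted spellings are an assumption; get_env_as_lowercase is
// taken to return the env var already lowercased):
static bool parse_bool_sketch(const std::string & value) {
    return value == "on" || value == "1" || value == "true" || value == "yes" || value == "enable";
}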
switch (type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
+#ifndef ASCEND_310P
+ case GGML_TYPE_BF16:
+#endif
ggml_cann_mat_mul_fp(ctx, dst);
break;
case GGML_TYPE_Q4_0:
static bool weight_to_nz = parse_bool(get_env_as_lowercase("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
- if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
+ if (weight_to_nz && tensor->type != GGML_TYPE_BF16
+ && is_matmul_weight((const ggml_tensor *) tensor)) {
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
weight_format_to_nz(tensor, offset, ctx->device);
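// Note: BF16 is excluded from the NZ path here just as in the tensor-creation
// hunk above; the likely reason (inferred, not stated in the patch) is that the
// ND -> FRACTAL_NZ transform in weight_format_to_nz does not handle BF16 yet.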
if (ne0 % MATRIX_ROW_PADDING != 0) {
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}
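// Worked example, assuming MATRIX_ROW_PADDING = 512 as in the CUDA backend:
// for ne0 = 1000, 1000 % 512 = 488, so the row grows by
// ggml_row_size(tensor->type, 512 - 488) = ggml_row_size(tensor->type, 24).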
- } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
+ } else if (weight_to_nz && tensor->type != GGML_TYPE_BF16
+ && is_matmul_weight((const ggml_tensor *) tensor)) {
// Quantized weights are not supported in NZ format yet.
// Transforming an ND tensor to NZ may change its size.
int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
case GGML_OP_MUL_MAT:
{
switch (op->src[0]->type) {
+#ifndef ASCEND_310P
+ case GGML_TYPE_BF16:
+#endif
case GGML_TYPE_F16:
case GGML_TYPE_F32:
return true;
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
+#ifndef ASCEND_310P
+ case GGML_TYPE_BF16:
+#endif
case GGML_TYPE_Q8_0:
return true;
default:
switch (op->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
+#ifndef ASCEND_310P
+ case GGML_TYPE_BF16:
+#endif
return true;
default:
return false;
case GGML_OP_CPY:
{
ggml_tensor * src = op->src[0];
+#ifdef ASCEND_310P
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
- // only support F32 and F16.
+ // only support F32 and F16 on 310P.
return false;
}
+#else
+ if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_BF16) ||
+ (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16 && src->type != GGML_TYPE_BF16)) {
+ // only support F32, F16 and BF16.
+ return false;
+ }
+#endif
return true;
}
break;
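// The two branches above encode one predicate, applied to both op->type and
// src->type. An equivalent helper-shaped sketch (the helper name is mine, not
// the patch's):
static bool cann_cpy_type_supported(ggml_type t) {
#ifdef ASCEND_310P
    return t == GGML_TYPE_F32 || t == GGML_TYPE_F16;  // no BF16 copy on 310P
#else
    return t == GGML_TYPE_F32 || t == GGML_TYPE_F16 || t == GGML_TYPE_BF16;
#endif
}
// ... and the case body reduces to:
// return cann_cpy_type_supported(op->type) && cann_cpy_type_supported(src->type);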
case GGML_OP_CONT:
{
- // TODO: support GGML_TYPE_BF16
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
+#ifndef ASCEND_310P
+ case GGML_TYPE_BF16:
+#endif
return true;
default:
return false;
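// ASCEND_310P is assumed to be a compile definition set by the build when
// targeting Atlas 310P SoCs; every BF16 case added in this patch is guarded by
// it, so behaviour on 310P is unchanged.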