GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
                                                       const void* src,
                                                       void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);
    int64_t n_elems = ggml_nelements(tensor);  // total number of Q4_0 elements
    int64_t groups = n_elems / QK4_0;          // number of QK4_0-element quant blocks
/**
 * @brief Transform CANN-processed data back into ggml's quantized Q4.0
 *        layout; the inverse of ggml_backend_cann_transform_q4_0.
 */
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
    const ggml_tensor* tensor, void* src, void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);
    int64_t n_elems = ggml_nelements(tensor);
    int64_t groups = n_elems / QK4_0;
}
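/*
 * Illustrative sketch (not the backend's actual implementation): the two
 * transforms above convert between ggml's interleaved block_q4_0 layout
 * (one fp16 scale followed by QK4_0/2 bytes of packed 4-bit quants per
 * block) and a split layout with all quant bytes first and all scales
 * after them, which is what the CANN kernels consume. The struct and
 * helper names below are hypothetical, dst is assumed to be at least
 * groups * (QK4_0/2 + sizeof(uint16_t)) bytes, and any nibble reordering
 * or sign adjustment the real transform may perform is omitted.
 */
#include <stdint.h>
#include <string.h>

#ifndef QK4_0
#define QK4_0 32
#endif

typedef struct {
    uint16_t d;              // fp16 scale (delta), stored as raw bits
    uint8_t  qs[QK4_0 / 2];  // 32 x 4-bit quants, two per byte
} block_q4_0_sketch;

static void cann_split_q4_0_sketch(const block_q4_0_sketch* src, void* dst,
                                   int64_t groups) {
    uint8_t*  quants = (uint8_t*)dst;  // all quant bytes are packed first
    uint16_t* scales = (uint16_t*)(quants + groups * (QK4_0 / 2));  // scales follow

    for (int64_t i = 0; i < groups; ++i) {
        memcpy(quants + i * (QK4_0 / 2), src[i].qs, QK4_0 / 2);
        scales[i] = src[i].d;
    }
}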
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
-                // case GGML_TYPE_Q4_0:
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
                case GGML_TYPE_Q8_0:
+                    // TODO: fix me
+                    // The group size currently must not be greater than k-1
+                    // in aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
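+                    // (QK8_0 is 32 in ggml, so the check below only reports
+                    //  Q8_0 mul_mat as supported when the reduction dimension
+                    //  K = op->src[0]->ne[0] satisfies K - 1 > 32.)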
+                    return op->src[0]->ne[0] - 1 > QK8_0;
+                case GGML_TYPE_Q4_0:
                    return true;
                default:
                    return false;

                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
                    return true;
                default:
                    return false;