printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
} else {
new_type = quantized_type;
- if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
- else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+ // TODO: temporary disabled until Metal / OpenCL support is available
+ // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+ //if (tensor.name == "output.weight") {
+ // new_type = GGML_TYPE_Q6_K;
+ //}
+ if (tensor.name.find("attention.wv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
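                // for the *_K_M mixes, only a subset of layers gets the 6-bit bump:
                // the first and last eighth of the layers, plus every third layer in between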
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_attention_wv;
}
- else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+ if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                        (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
++i_feed_forward_w2;
}
- else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+ if (tensor.name.find("attention.wo.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
+
float * f32_data;
size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
llama_buffer f32_conv_buf;