From: Georgi Gerganov
Date: Tue, 6 Jun 2023 06:39:38 +0000 (+0300)
Subject: llama : temporary disable Q6_K output quantization (#1711)
X-Git-Tag: gguf-v0.4.0~682
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=7a74dee6b4e0e80862191141c0037abe28967d5c;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : temporary disable Q6_K output quantization (#1711)
---

diff --git a/llama.cpp b/llama.cpp
index 568ce6ac..70341d04 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2198,8 +2198,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2207,7 +2211,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                          (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2215,10 +2219,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                          (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;