git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
llama : temporary disable Q6_K output quantization (#1711)
author    Georgi Gerganov <redacted>
Tue, 6 Jun 2023 06:39:38 +0000 (09:39 +0300)
committer Georgi Gerganov <redacted>
Tue, 6 Jun 2023 06:39:38 +0000 (09:39 +0300)
llama.cpp

index 568ce6acaab9c3fde6811a07f847b4733f788231..70341d04f91b0a255014960c173e63d50da63b28 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -2198,8 +2198,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            //       ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2207,7 +2211,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                          (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -2215,10 +2219,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                          (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
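
For reference, a minimal standalone sketch of the per-tensor type selection after this patch: the output.weight override to Q6_K is commented out, and the wv / w2 / wo checks now run as independent if blocks rather than an else-if chain. This is not the actual llama.cpp source; the enum values and the pick_k_quant_type helper are illustrative stand-ins, and the layer-indexed Q6_K upgrades for Q4_K_M / Q5_K_M are omitted for brevity.

#include <cstdio>
#include <string>

// Illustrative stand-ins for the ggml/llama enums referenced in the diff.
enum ggml_type   { GGML_TYPE_Q3_K, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K };
enum llama_ftype { LLAMA_FTYPE_MOSTLY_Q2_K, LLAMA_FTYPE_MOSTLY_Q3_K_M, LLAMA_FTYPE_MOSTLY_Q3_K_L,
                   LLAMA_FTYPE_MOSTLY_Q4_K_M, LLAMA_FTYPE_MOSTLY_Q5_K_M };

// Hypothetical helper mirroring the patched selection logic.
static ggml_type pick_k_quant_type(const std::string & name, llama_ftype ftype, ggml_type quantized_type) {
    ggml_type new_type = quantized_type;
    // output.weight is no longer forced to Q6_K (disabled until Metal / OpenCL
    // gain Q6_K support, see issue #1711).
    if (name.find("attention.wv.weight")    != std::string::npos ||
        name.find("feed_forward.w2.weight") != std::string::npos ||
        name.find("attention.wo.weight")    != std::string::npos) {
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L)                                     new_type = GGML_TYPE_Q5_K;
    }
    return new_type;
}

int main() {
    // With Q3_K_M, output.weight now stays at the base type instead of Q6_K,
    // while attention.wv.weight is still upgraded to Q4_K.
    printf("output.weight       -> %d\n", pick_k_quant_type("output.weight",                LLAMA_FTYPE_MOSTLY_Q3_K_M, GGML_TYPE_Q3_K));
    printf("attention.wv.weight -> %d\n", pick_k_quant_type("layers.0.attention.wv.weight", LLAMA_FTYPE_MOSTLY_Q3_K_M, GGML_TYPE_Q3_K));
    return 0;
}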