llama : allow quantizing k-quants to fall back when tensor size incompatible (#3747)
author    Kerfuffle <redacted>
          Sat, 28 Oct 2023 11:54:24 +0000 (05:54 -0600)
committer GitHub <redacted>
          Sat, 28 Oct 2023 11:54:24 +0000 (14:54 +0300)
* Allow quantizing k-quants to fall back when tensor size incompatible

* quantizing: Add warning when tensors were incompatible with k-quants

Clean up k-quants state passing a bit

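In essence: when a tensor's row size is not a multiple of QK_K, the selected k-quant type is now remapped to a legacy quant of roughly comparable bits-per-weight instead of aborting. A condensed, standalone sketch of that mapping follows (k_quant_fallback is a hypothetical helper name for illustration; in the commit the switch lives inline in get_k_quant_type, shown in the diff below):

    // Sketch of the fallback mapping this commit adds. Each k-quant maps to
    // a legacy quant with a similar size/quality trade-off, so quantization
    // no longer fails outright on tensors incompatible with k-quants.
    #include <stdexcept>
    #include "ggml.h"

    static ggml_type k_quant_fallback(ggml_type new_type) {
        switch (new_type) {
            case GGML_TYPE_Q2_K: return GGML_TYPE_Q4_0;
            case GGML_TYPE_Q3_K: return GGML_TYPE_Q4_1;
            case GGML_TYPE_Q4_K: return GGML_TYPE_Q5_0;
            case GGML_TYPE_Q5_K: return GGML_TYPE_Q5_1;
            case GGML_TYPE_Q6_K: return GGML_TYPE_Q8_0;
            default: throw std::runtime_error("Unsupported tensor size encountered");
        }
    }
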
llama.cpp

index 6caa58960cf3c31d07209473e302d976407bf645..3d431ee7bf526033695b5fa6b748ed4c15ac25cd 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8049,6 +8049,24 @@ struct no_init {
     no_init() { /* do nothing */ }
 };
 
+struct quantize_state_internal {
+    const llama_model                 & model;
+    const llama_model_quantize_params * params;
+#ifdef GGML_USE_K_QUANTS
+    int n_attention_wv    = 0;
+    int n_feed_forward_w2 = 0;
+    int i_attention_wv    = 0;
+    int i_feed_forward_w2 = 0;
+
+    int n_k_quantized     = 0;
+    int n_fallback        = 0;
+#endif
+    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+        {}
+};
+
 static void llama_convert_tensor_internal(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -8109,12 +8127,13 @@ static void llama_convert_tensor_internal(
 
 #ifdef GGML_USE_K_QUANTS
 static ggml_type get_k_quant_type(
-    ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
-    int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
+    quantize_state_internal & qs,
+    ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const auto tn = LLM_TN(model.arch);
+    const llm_arch arch = qs.model.arch;
+    const auto       tn = LLM_TN(arch);
 
     auto use_more_bits = [](int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
@@ -8122,7 +8141,7 @@ static ggml_type get_k_quant_type(
 
     if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
         int nx = tensor->ne[0];
-        if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
@@ -8131,46 +8150,46 @@ static ggml_type get_k_quant_type(
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
         else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-        if (model.type == MODEL_70B) {
+                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_attention_wv;
+        ++qs.i_attention_wv;
     } else if (name.find("ffn_down.weight") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (model.arch == LLM_ARCH_FALCON) {
-                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
             new_type = GGML_TYPE_Q5_K;
         }
-        ++*i_feed_forward_w2;
+        ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
-        if (model.arch != LLM_ARCH_FALCON) {
+        if (arch != LLM_ARCH_FALCON) {
             if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -8197,20 +8216,23 @@ static ggml_type get_k_quant_type(
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
+        } else {
+            ++qs.n_k_quantized;
         }
     }
     if (convert_incompatible_tensor) {
-        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-        } else {
-            throw std::runtime_error("Unsupported tensor size encountered\n");
+        switch (new_type) {
+            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
+        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+        ++qs.n_fallback;
     }
 
     return new_type;
@@ -8268,6 +8290,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(ml, model);
     llm_load_hparams(ml, model);
 
+    struct quantize_state_internal qs(model, params);
+
     if (params->only_copy) {
         ftype = model.ftype;
     }
@@ -8281,9 +8305,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     gguf_set_val_u32(ctx_out, "general.file_type", ftype);
 
 #ifdef GGML_USE_K_QUANTS
-    int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * meta = ml.get_tensor_meta(i);
 
@@ -8291,19 +8312,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
-            ++n_attention_wv;
+            ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
-            ++n_feed_forward_w2;
+            ++qs.n_feed_forward_w2;
         }
     }
-    if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
+    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
         LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
+                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
     }
-
-    int i_attention_wv = 0;
-    int i_feed_forward_w2 = 0;
 #endif
 
     size_t total_size_org = 0;
@@ -8370,9 +8388,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            new_type = get_k_quant_type(
-                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
-            );
+            new_type = get_k_quant_type(qs, new_type, tensor, ftype);
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
@@ -8498,6 +8514,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             LLAMA_LOG_INFO("\n");
         }
     }
+#ifdef GGML_USE_K_QUANTS
+    if (qs.n_fallback > 0) {
+        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
+                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+    }
+#endif
 }
 
 static int llama_apply_lora_from_file_internal(
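For reference, the quantization path changed above is reached through the public API declared in llama.h. A minimal caller might look like the sketch below (file names are placeholders; the llama_model_quantize_params fields and the llama_backend_init(bool) signature are as of this revision):

    // Minimal sketch: quantize a GGUF model via the public API in llama.h.
    // With this commit, a model containing tensors whose row size is not a
    // multiple of QK_K no longer aborts; incompatible tensors fall back to
    // legacy quants and a summary warning is printed at the end.
    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_backend_init(false); // as in examples/quantize

        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // request k-quants
        params.nthread = 4;

        // Input/output paths are placeholders.
        const uint32_t res = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params);
        llama_backend_free();

        if (res != 0) {
            fprintf(stderr, "quantization failed\n");
            return 1;
        }
        return 0;
    }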