return GGML_TYPE_Q5_1;
}
- throw std::runtime_error("Invalid cache type: " + s);
+ throw std::runtime_error("Unsupported cache type: " + s);
}
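
The hunk above is the fall-through at the end of the string-to-ggml_type lookup used for the KV-cache type options; the only change is the wording of the error. A minimal sketch of the same pattern, with a made-up helper name standing in for the real one:

// Sketch of the lookup shape (helper name and the exact set of accepted
// strings are illustrative, not copied from llama.cpp). Needs <string>,
// <stdexcept> and ggml.h.
static ggml_type cache_type_from_str_sketch(const std::string & s) {
    if (s == "f16")  { return GGML_TYPE_F16;  }
    if (s == "q8_0") { return GGML_TYPE_Q8_0; }
    if (s == "q5_1") { return GGML_TYPE_Q5_1; }
    // anything unrecognised surfaces as a readable error rather than a silent default
    throw std::runtime_error("Unsupported cache type: " + s);
}
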
struct llama_context_params common_context_params_to_llama(const common_params & params) {
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
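
Here n_threads_batch inherits the generation thread count when it is left at -1. The same defaulting rule in isolation (field names taken from the hunk, the helper itself is hypothetical):

// -1 means "use the same number of threads as generation".
static int resolve_batch_threads(int n_threads, int n_threads_batch) {
    return n_threads_batch == -1 ? n_threads : n_threads_batch;
}
// resolve_batch_threads(8, -1) == 8, resolve_batch_threads(8, 16) == 16
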
static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
- if (format == NULL)
+ if (format == NULL) {
return;
+ }
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
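
The braces added around the early return are purely stylistic. The va_copy matters because the remainder of the function (not shown in this excerpt) presumably formats the arguments twice: once into the 128-byte stack buffer and, if the message does not fit, a second time into a heap buffer. A hedged sketch of that two-pass pattern, not a copy of the real function body:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

// Two-pass vsnprintf: the first call consumes `args`, so the retry on an
// oversized message has to use the copy made with va_copy.
static void log_formatted(void (*sink)(const char *), const char * fmt, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);
    char small[128];
    const int len = vsnprintf(small, sizeof(small), fmt, args);
    if (len < 0) {
        va_end(args_copy);
        return;                                        // encoding error
    }
    if (len < (int) sizeof(small)) {
        sink(small);                                   // common case: fits on the stack
    } else {
        char * big = (char *) malloc((size_t) len + 1);
        if (big != NULL) {
            vsnprintf(big, (size_t) len + 1, fmt, args_copy);
            sink(big);
            free(big);
        }
    }
    va_end(args_copy);
}
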
ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot;
ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
+ GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+ GGML_ASSERT(v_to_float && "fattn: unsupported V-type");
+
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
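
The two asserts turn an unsupported K or V type into a readable failure before entering the batch/head loop, rather than a crash on a NULL function pointer inside it. Note that q_to_vec_dot is not visible in this excerpt; it is presumably the from_float converter matching the K dot-product type, declared alongside kq_vec_dot and v_to_float just above the shown context.
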
params.flash_attn = false;
}
- if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+ if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
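
With flash attention off, the old guard rejected any V-cache type other than F16; the new one only rejects quantized types, so plain float V caches get through. The effect, spelled out for a few representative types:

// With params.flash_attn == false:
//   ggml_is_quantized(GGML_TYPE_F16)  == false  -> allowed (same as before)
//   ggml_is_quantized(GGML_TYPE_F32)  == false  -> allowed (previously rejected)
//   ggml_is_quantized(GGML_TYPE_Q8_0) == true   -> rejected with the error above
//   ggml_is_quantized(GGML_TYPE_Q4_0) == true   -> rejected with the error above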