return GGML_TYPE_Q5_1;
}
- throw std::runtime_error("Invalid cache type: " + s);
+ throw std::runtime_error("Unsupported cache type: " + s);
}
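
The hunk above is the fall-through at the end of the string-to-ggml_type lookup used for the KV-cache type options; the only change is the wording of the error. A minimal sketch of the same pattern, with a made-up helper name standing in for the real one:

// Sketch of the lookup shape (helper name and the exact set of accepted
// strings are illustrative, not copied from llama.cpp). Needs <string>,
// <stdexcept> and ggml.h.
static ggml_type cache_type_from_str_sketch(const std::string & s) {
    if (s == "f16")  { return GGML_TYPE_F16;  }
    if (s == "q8_0") { return GGML_TYPE_Q8_0; }
    if (s == "q5_1") { return GGML_TYPE_Q5_1; }
    // anything unrecognised surfaces as a readable error rather than a silent default
    throw std::runtime_error("Unsupported cache type: " + s);
}
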
struct llama_context_params common_context_params_to_llama(const common_params & params) {
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
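
Here n_threads_batch inherits the generation thread count when it is left at -1. The same defaulting rule in isolation (field names taken from the hunk, the helper itself is hypothetical):

// -1 means "use the same number of threads as generation".
static int resolve_batch_threads(int n_threads, int n_threads_batch) {
    return n_threads_batch == -1 ? n_threads : n_threads_batch;
}
// resolve_batch_threads(8, -1) == 8, resolve_batch_threads(8, 16) == 16
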
static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
- if (format == NULL)
+ if (format == NULL) {
return;
+ }
va_list args_copy;
va_copy(args_copy, args);
char buffer[128];
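
The braces added around the early return are purely stylistic. The va_copy matters because the remainder of the function (not shown in this excerpt) presumably formats the arguments twice: once into the 128-byte stack buffer and, if the message does not fit, a second time into a heap buffer. A hedged sketch of that two-pass pattern, not a copy of the real function body:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

// Two-pass vsnprintf: the first call consumes `args`, so the retry on an
// oversized message has to use the copy made with va_copy.
static void log_formatted(void (*sink)(const char *), const char * fmt, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);
    char small[128];
    const int len = vsnprintf(small, sizeof(small), fmt, args);
    if (len < 0) {
        va_end(args_copy);
        return;                                        // encoding error
    }
    if (len < (int) sizeof(small)) {
        sink(small);                                   // common case: fits on the stack
    } else {
        char * big = (char *) malloc((size_t) len + 1);
        if (big != NULL) {
            vsnprintf(big, (size_t) len + 1, fmt, args_copy);
            sink(big);
            free(big);
        }
    }
    va_end(args_copy);
}
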
ggml_vec_dot_t const kq_vec_dot = type_traits[k->type].vec_dot;
ggml_to_float_t const v_to_float = type_traits[v->type].to_float;
+ GGML_ASSERT(q_to_vec_dot && "fattn: unsupported K-type");
+ GGML_ASSERT(v_to_float && "fattn: unsupported V-type");
+
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
// q indices
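
The two asserts turn an unsupported K or V type into a readable failure before entering the batch/head loop, rather than a crash on a NULL function pointer inside it. Note that q_to_vec_dot is not visible in this excerpt; it is presumably the from_float converter matching the K dot-product type, declared alongside kq_vec_dot and v_to_float just above the shown context.
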
params.flash_attn = false;
}
- if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+ if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
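
With flash attention off, the old guard rejected any V-cache type other than F16; the new one only rejects quantized types, so plain float V caches get through. The effect, spelled out for a few representative types:

// With params.flash_attn == false:
//   ggml_is_quantized(GGML_TYPE_F16)  == false  -> allowed (same as before)
//   ggml_is_quantized(GGML_TYPE_F32)  == false  -> allowed (previously rejected)
//   ggml_is_quantized(GGML_TYPE_Q8_0) == true   -> rejected with the error above
//   ggml_is_quantized(GGML_TYPE_Q4_0) == true   -> rejected with the error above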