++qs.i_ffn_up;
}
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- //}
- // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
- //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- //}
- // This can be used to reduce the size of the Q5_K_S model.
- // The associated PPL increase is fully in line with the size reduction
- //else {
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
- //}
- bool convert_incompatible_tensor = false;
- {
- const int64_t nx = tensor->ne[0];
- const int64_t ny = tensor->ne[1];
- const int64_t qk_k = ggml_blck_size(new_type);
-
- if (nx % qk_k != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
- convert_incompatible_tensor = true;
- } else {
- ++qs.n_k_quantized;
- }
- }
-
- if (convert_incompatible_tensor) {
- switch (new_type) {
- case GGML_TYPE_TQ1_0:
- case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
- case GGML_TYPE_IQ2_XXS:
- case GGML_TYPE_IQ2_XS:
- case GGML_TYPE_IQ2_S:
- case GGML_TYPE_IQ3_XXS:
- case GGML_TYPE_IQ3_S:
- case GGML_TYPE_IQ1_S:
- case GGML_TYPE_IQ1_M:
- case GGML_TYPE_Q2_K:
- case GGML_TYPE_Q3_K:
- case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
- case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
- case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
- case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
- default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
- }
- if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
- new_type = GGML_TYPE_F16;
- }
- LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
- ++qs.n_fallback;
- }
-
return new_type;
}
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
- int fallback = qs.n_fallback;
- new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
- // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
- if (params->tensor_types && qs.n_fallback - fallback == 0) {
+ // if the user provided tensor types - use those
+ bool manual = false;
+ if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name);
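+ // each entry pairs a user-supplied regex (e.g. from llama-quantize's --tensor-type option)
+ // with the ggml type to force for tensors whose name matches it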
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
- LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
- new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
+ LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
+ new_type = qtype;
}
+ // the break below means the first matching pattern wins; a match counts as
+ // manual even when the requested type equals the current one
+ manual = true;
+ break;
}
}
}
+
+ // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+ if (!manual) {
+ new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+ }
+
+ // incompatible tensor shapes are handled here - fall back to a compatible type
+ {
+ bool convert_incompatible_tensor = false;
+
+ const int64_t nx = tensor->ne[0];
+ const int64_t ny = tensor->ne[1];
+ const int64_t qk_k = ggml_blck_size(new_type);
+
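+ // quantized types store weights in fixed-size blocks along each row,
+ // so the row size (ne[0]) must be a whole multiple of the block size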
+ if (nx % qk_k != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+ convert_incompatible_tensor = true;
+ } else {
+ ++qs.n_k_quantized;
+ }
+
+ if (convert_incompatible_tensor) {
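+ // all of the types below pack 256 values per block; the fallback targets
+ // use 32-wide blocks, which divide far more tensor shapes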
+ switch (new_type) {
+ case GGML_TYPE_TQ1_0:
+ case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
+ case GGML_TYPE_IQ2_XXS:
+ case GGML_TYPE_IQ2_XS:
+ case GGML_TYPE_IQ2_S:
+ case GGML_TYPE_IQ3_XXS:
+ case GGML_TYPE_IQ3_S:
+ case GGML_TYPE_IQ1_S:
+ case GGML_TYPE_IQ1_M:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+ case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+ case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+ case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+ default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+ }
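+ // if even the fallback block size does not divide the row, store the tensor as F16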
+ if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+ new_type = GGML_TYPE_F16;
+ }
+ LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+ ++qs.n_fallback;
+ }
+ }
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;