{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+ { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
default: return "unknown, may not work";
}
const llama_model_quantize_params * params;
int n_attention_wv = 0;
- int n_feed_forward_w2 = 0;
+ int n_ffn_down = 0;
+ int n_ffn_gate = 0;
+ int n_ffn_up = 0;
int i_attention_wv = 0;
- int i_feed_forward_w2 = 0;
+ int i_ffn_down = 0;
+ int i_ffn_gate = 0;
+ int i_ffn_up = 0;
int n_k_quantized = 0;
int n_fallback = 0;
++qs.i_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
- ++qs.i_feed_forward_w2;
+ if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+ ++qs.i_ffn_down;
}
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
} else if (name.find("attn_v.weight") != std::string::npos) {
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+ new_type = GGML_TYPE_Q2_K;
+ }
} else if (name.find("ffn_down") != std::string::npos) {
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
int i_layer, n_layer;
if (n_expert == 1) {
- i_layer = qs.i_feed_forward_w2;
- n_layer = qs.n_feed_forward_w2;
+ i_layer = qs.i_ffn_down;
+ n_layer = qs.n_ffn_down;
} else {
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
- // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+ // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
// for getting the current layer as I initially thought, and we need to resort to parsing the
// tensor name.
- n_layer = qs.n_feed_forward_w2 / n_expert;
+ n_layer = qs.n_ffn_down / n_expert;
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
}
}
}
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
}
- ++qs.i_feed_forward_w2;
+ ++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+ ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
new_type = GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
+ else if (name.find("ffn_gate") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
+ new_type = GGML_TYPE_Q2_K;
+ }
+ ++qs.i_ffn_gate;
+ }
+ else if (name.find("ffn_up") != std::string::npos) {
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
+ new_type = GGML_TYPE_Q2_K;
+ }
+ ++qs.i_ffn_up;
+ }
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+ //}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
// K-quants
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S:
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
+ case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
++qs.n_attention_wv;
}
else if (name.find("ffn_down") != std::string::npos) {
- ++qs.n_feed_forward_w2;
+ ++qs.n_ffn_down;
+ }
+ else if (name.find("ffn_gate") != std::string::npos) {
+ ++qs.n_ffn_gate;
+ }
+ else if (name.find("ffn_up") != std::string::npos) {
+ ++qs.n_ffn_up;
}
}
- if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
- LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
- __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+ if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+ LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
+ __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
}
size_t total_size_org = 0;