gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
+ gguf.MODEL_TENSOR.SHORTCONV_CONV,
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
res = "midm-2.0"
+ if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+ # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+ res = "lfm2"
if res is None:
logger.warning("\n")
chat_template = tokenizer.chat_template.replace("[:]", "")
self.gguf_writer.add_chat_template(chat_template)
+
+@ModelBase.register("Lfm2ForCausalLM")
+@ModelBase.register("LFM2ForCausalLM")
+class LFM2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.LFM2
+
+ def _add_feed_forward_length(self):
+ ff_dim = self.hparams["block_ff_dim"]
+
+ auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
+ ff_dim = self.hparams["block_ff_dim"]
+ ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
+ multiple_of = self.hparams["block_multiple_of"]
+
+ if auto_adjust_ff_dim:
+ ff_dim = int(2 * ff_dim / 3)
+ # custom dim factor multiplier
+ if ffn_dim_multiplier is not None:
+ ff_dim = int(ffn_dim_multiplier * ff_dim)
+ ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
+
+ self.gguf_writer.add_feed_forward_length(ff_dim)
+
+ def set_gguf_parameters(self):
+ # set num_key_value_heads only for attention layers
+ self.hparams["num_key_value_heads"] = [
+ self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+ for layer_type in self.hparams["layer_types"]
+ ]
+
+ super().set_gguf_parameters()
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+ self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+ self._add_feed_forward_length()
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # conv op requires 2d tensor
+ if 'conv.conv' in name:
+ data_torch = data_torch.squeeze(1)
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+
###### CONVERSION LOGIC ######
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
+ {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
]
# some models are known to be broken upstream, so we will skip them as exceptions
if (nc == 4) {
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+ } else if (nc == 3) {
+ ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+ dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
- GGML_ABORT("Only support kernel size = 4 now.");
+ GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
} else {
if (nc == 4) {
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+ } else if (nc == 3) {
+ const int64_t split_n_t = 32;
+ dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
+ ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
+ src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
- GGML_ABORT("Only support kernel size = 4 right now.");
+ GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
}
}
class Classifier:
OUTPUT_LABELS = "{arch}.classifier.output_labels"
+    # metadata keys for the LFM2 short-convolution mixer
+    class ShortConv:
+        L_CACHE = "{arch}.shortconv.l_cache"
+
class Tokenizer:
MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre"
ERNIE4_5 = auto()
HUNYUAN_MOE = auto()
SMOLLM3 = auto()
+ LFM2 = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
POSNET_ATTN_K = auto()
POSNET_ATTN_V = auto()
POSNET_ATTN_OUT = auto()
+ SHORTCONV_CONV = auto()
+ SHORTCONV_INPROJ = auto()
+ SHORTCONV_OUTPROJ = auto()
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.SMOLLM3: "smollm3",
+ MODEL_ARCH.LFM2: "lfm2",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
+ MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
+ MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
+ MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
# vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
+ MODEL_ARCH.LFM2: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
+ MODEL_TENSOR.SHORTCONV_CONV,
+ MODEL_TENSOR.SHORTCONV_INPROJ,
+ MODEL_TENSOR.SHORTCONV_OUTPROJ,
+ MODEL_TENSOR.FFN_GATE,
+ MODEL_TENSOR.FFN_DOWN,
+ MODEL_TENSOR.FFN_UP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.ATTN_NORM, # operator_norm
+ MODEL_TENSOR.ATTN_Q_NORM,
+ MODEL_TENSOR.ATTN_K_NORM,
+ MODEL_TENSOR.ATTN_Q,
+ MODEL_TENSOR.ATTN_K,
+ MODEL_TENSOR.ATTN_V,
+ MODEL_TENSOR.ATTN_OUT,
+ ],
# TODO
}
def add_convnext_block_count(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
+    def add_shortconv_l_cache(self, length: int) -> None:
+        """Write the short-conv cache length ("{arch}.shortconv.l_cache") as a uint32."""
+        self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length)
+
def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
"model.pre_ln", # rwkv7
"model.layers.0.pre_norm", # rwkv7
"backbone.norm", # wavtokenizer
+ "model.embedding_norm", # lfm2
),
# Position embeddings
"model.layers.{bid}.ln1", # rwkv7
"model.layers.{bid}.input_layernorm", # llama4
"transformer_encoder.{bid}.attention_norm", # neobert
+ "model.layers.{bid}.operator_norm", # lfm2
),
# Attention norm 2
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
+ "model.layers.{bid}.self_attn.out_proj", # lfm2
"model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"backbone.posnet.{bid}.proj_out", # wavtokenizer
),
+ MODEL_TENSOR.SHORTCONV_CONV: (
+ "model.layers.{bid}.conv.conv",
+ ),
+
+ MODEL_TENSOR.SHORTCONV_INPROJ: (
+ "model.layers.{bid}.conv.in_proj",
+ ),
+
+ MODEL_TENSOR.SHORTCONV_OUTPROJ: (
+ "model.layers.{bid}.conv.out_proj",
+ ),
+
#############################################################################
## Vision encoder
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_SMOLLM3, "smollm3" },
+ { LLM_ARCH_LFM2, "lfm2" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+ { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
+
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_LFM2,
+ {
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
+ { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
+ { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+ }
+ },
{
LLM_ARCH_UNKNOWN,
{
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+ {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
case LLM_ARCH_JAMBA:
case LLM_ARCH_FALCON_H1:
case LLM_ARCH_GRANITE_HYBRID:
+ case LLM_ARCH_LFM2:
return true;
default:
return false;
LLM_ARCH_ERNIE4_5,
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_SMOLLM3,
+ LLM_ARCH_LFM2,
LLM_ARCH_UNKNOWN,
};
LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+ LLM_KV_SHORTCONV_L_CACHE,
+
// deprecated:
LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID,
LLM_TENSOR_POS_NET_ATTN_K,
LLM_TENSOR_POS_NET_ATTN_V,
LLM_TENSOR_POS_NET_ATTN_OUT,
+ LLM_TENSOR_SHORTCONV_CONV,
+ LLM_TENSOR_SHORTCONV_INPROJ,
+ LLM_TENSOR_SHORTCONV_OUTPROJ,
};
enum llm_tensor_layer {
return token_shift_count * n_embd;
}
+ if (n_shortconv_l_cache != 0) {
+ // for LFM2 models
+ return n_embd * (n_shortconv_l_cache - 1);
+ }
+
// TODO: maybe support other convolution strides than 1
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
// Corresponds to Mamba's conv_states size
struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext;
+ uint32_t n_shortconv_l_cache = 0;
+
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
case LLM_TYPE_256M: return "256M";
case LLM_TYPE_270M: return "270M";
case LLM_TYPE_335M: return "335M";
+ case LLM_TYPE_350M: return "350M";
case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M";
+ case LLM_TYPE_700M: return "700M";
case LLM_TYPE_770M: return "770M";
case LLM_TYPE_780M: return "780M";
case LLM_TYPE_0_3B: return "0.3B";
case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B";
case LLM_TYPE_1B: return "1B";
+ case LLM_TYPE_1_2B: return "1.2B";
case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_1_4B: return "1.4B";
case LLM_TYPE_1_5B: return "1.5B";
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_LFM2:
+ {
+ ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
+ }
+ switch (hparams.n_embd) {
+ case 1024: type = LLM_TYPE_350M; break;
+ case 1536: type = LLM_TYPE_700M; break;
+ case 2048: type = LLM_TYPE_1_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
default: throw std::runtime_error("unsupported model architecture");
}
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
+ case LLM_ARCH_LFM2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ // ffn is same for transformer and conv layers
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // for operator_norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (!hparams.is_recurrent(i)) {
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ } else {
+ layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
+ layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
+ layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
+ }
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
}
};
+// Graph builder for LFM2 hybrid models: each layer is either a short-convolution
+// mixer (recurrent) or full attention, selected per layer via hparams.is_recurrent().
+struct llm_build_lfm2 : public llm_graph_context {
+    const llama_model & model;
+
+    llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
+
+        ggml_tensor * cur = build_inp_embd(model.tok_embd);
+        cb(cur, "model.embed_tokens", -1);
+
+        ggml_tensor * inp_pos = build_inp_pos();
+        // hybrid memory: unified attention KV cache + recurrent (conv) state
+        auto * inp_hybrid = build_inp_mem_hybrid();
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            auto * prev_cur = cur;
+            // attn_norm holds the checkpoint's operator_norm weights for both layer kinds
+            cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "model.layers.{}.operator_norm", il);
+
+            cur = hparams.is_recurrent(il) ?
+                build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) :
+                build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ;
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                // on the last layer keep only the tokens whose outputs are requested
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
+            }
+
+            // residual around the mixer, then residual around the FFN
+            cur = ggml_add(ctx0, prev_cur, cur);
+            cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
+        }
+
+        cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "model.embedding_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head is tied with embeddings
+        cur = build_lora_mm(model.tok_embd, cur);
+        cb(cur, "lm_head", -1);
+
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+
+    // pre-norm SwiGLU FFN (gate/up/down, asserted bias-free)
+    ggml_tensor * build_feed_forward(ggml_tensor * cur,
+                                     int il) const {
+        cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "model.layers.{}.ffn_norm", il);
+
+        GGML_ASSERT(!model.layers[il].ffn_up_b);
+        GGML_ASSERT(!model.layers[il].ffn_gate_b);
+        GGML_ASSERT(!model.layers[il].ffn_down_b);
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "model.layers.{}.feed_forward.w2", il);
+
+        return cur;
+    }
+
+    // GQA attention with per-head RMS q/k norm and RoPE on q and k
+    ggml_tensor * build_attn_block(ggml_cgraph * gf,
+                                   ggml_tensor * cur,
+                                   ggml_tensor * inp_pos,
+                                   llm_graph_input_attn_kv_unified * inp_attn,
+                                   int il) const {
+        GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
+        auto const n_embd_head = hparams.n_embd_head_v;
+        auto const n_head_kv = hparams.n_head_kv(il);
+
+        auto * q = build_lora_mm(model.layers[il].wq, cur);
+        cb(q, "model.layers.{}.self_attn.q_proj", il);
+        auto * k = build_lora_mm(model.layers[il].wk, cur);
+        cb(k, "model.layers.{}.self_attn.k_proj", il);
+        auto * v = build_lora_mm(model.layers[il].wv, cur);
+        cb(v, "model.layers.{}.self_attn.v_proj", il);
+
+        // split heads: {n_embd_head, n_head(_kv), n_tokens}
+        q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
+        k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
+        v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
+
+        // qk norm
+        q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+        cb(q, "model.layers.{}.self_attn.q_layernorm", il);
+        k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+        cb(k, "model.layers.{}.self_attn.k_layernorm", il);
+
+        // RoPE
+        q = ggml_rope_ext(
+            ctx0, q, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow
+        );
+        k = ggml_rope_ext(
+            ctx0, k, inp_pos, nullptr,
+            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+            ext_factor, attn_factor, beta_fast, beta_slow
+        );
+
+        cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL,
+                q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+        cb(cur, "model.layers.{}.self_attn.out_proj", il);
+
+        return cur;
+    }
+
+    // LFM2 short-conv mixer: in_proj -> (b, c, x) chunks, causal conv over b*x
+    // using cached history, gate with c, then out_proj
+    ggml_tensor * build_shortconv_block(ggml_cgraph * gf,
+                                        ggml_tensor * cur,
+                                        llm_graph_input_rs * inp_recr,
+                                        int il) {
+        const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
+
+        auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
+        cb(bcx, "model.layers.{}.conv.in_proj", il);
+
+        // in_proj output is three equal chunks along dim 0 (in_proj is {n_embd, 3*n_embd})
+        constexpr auto n_chunks = 3;
+        GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
+        auto const chunk_size = bcx->ne[0] / n_chunks;
+        auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx));
+        auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx));
+        auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx));
+
+        // conv input: elementwise product b*x, transposed so tokens run along dim 0
+        auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
+
+        // read conv state directly, with build_rs generation is slower
+        ggml_tensor * conv_state = mctx_cur->get_r_l(il);
+        const int64_t n_seqs = ubatch.n_seqs;
+        // NOTE(review): the build_rs result is immediately replaced on the next line;
+        // presumably the call is still needed to register the recurrent-state
+        // dependency in the graph — confirm
+        ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
+        conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs);
+
+        // prepend the cached (l_cache - 1) columns so the conv stays causal across ubatches
+        bx = ggml_concat(ctx0, conv, bx, 0);
+        GGML_ASSERT(bx->ne[0] > conv->ne[0]);
+
+        // the trailing conv->ne[0] columns become the new cached state
+        auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
+        GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
+
+        // write conv state
+        ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state));
+
+        auto * conv_kernel = model.layers[il].shortconv.conv;
+        GGML_ASSERT(hparams.n_shortconv_l_cache > 0);
+
+        // construct ssm_conv op
+        ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
+        cb(conv_out, "model.layers.{}.conv.conv", il);
+
+        // gate the conv output with c
+        auto * y = ggml_mul(ctx0, c, conv_out);
+
+        y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
+        cb(y, "model.layers.{}.conv.out_proj", il);
+
+        return y;
+    }
+};
+
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
llama_memory_i * res;
{
llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
} break;
+ case LLM_ARCH_LFM2:
+ {
+ llm = std::make_unique<llm_build_lfm2>(*this, params, gf);
+ } break;
default:
GGML_ABORT("fatal error");
}
case LLM_ARCH_MINICPM3:
case LLM_ARCH_DOTS1:
case LLM_ARCH_HUNYUAN_MOE:
+ case LLM_ARCH_LFM2:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:
LLM_TYPE_256M,
LLM_TYPE_270M,
LLM_TYPE_335M,
+ LLM_TYPE_350M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
+ LLM_TYPE_700M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_1B,
+ LLM_TYPE_1_2B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
struct ggml_tensor * gamma = nullptr;
};
+// LFM2 short-convolution mixer weights (present only on recurrent layers)
+struct llama_layer_shortconv {
+    struct ggml_tensor * in_proj = nullptr;  // {n_embd, 3*n_embd}: projects to b, c, x chunks
+    struct ggml_tensor * conv = nullptr;     // {n_shortconv_l_cache, n_embd}: conv kernel
+    struct ggml_tensor * out_proj = nullptr; // {n_embd, n_embd}: output projection
+};
+
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm = nullptr;
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
+
+ struct llama_layer_shortconv shortconv;
};
struct llama_model {
// do not quantize Mamba's small yet 2D weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;
// do not quantize RWKV's small yet 2D weights
quantize &= name.find("time_mix_first.weight") == std::string::npos;
tokenizer_pre == "falcon3" ||
tokenizer_pre == "falcon-h1" ||
tokenizer_pre == "pixtral" ||
- tokenizer_pre == "midm-2.0") {
+ tokenizer_pre == "midm-2.0" ||
+ tokenizer_pre == "lfm2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;