                raise ValueError(f"Unprocessed experts: {experts}")
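+# LlamaBidirectionalModel (e.g. llama-embed-nemotron) reuses the stock Llama conversion path;
+# only the GGUF architecture is remapped so the runtime builds the bidirectional embedding graph.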
+@ModelBase.register("LlamaBidirectionalModel")
+class LlamaEmbedNemotronModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA_EMBED
+
+
@ModelBase.register("BailingMoeForCausalLM")
class BailingMoeModel(TextModel):
    model_arch = gguf.MODEL_ARCH.BAILINGMOE
    RND1 = auto()
    PANGU_EMBED = auto()
    MISTRAL3 = auto()
+    LLAMA_EMBED = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
    MODEL_ARCH.RND1: "rnd1",
    MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
    MODEL_ARCH.MISTRAL3: "mistral3",
+    MODEL_ARCH.LLAMA_EMBED: "llama-embed",
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
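+    # tensor layout matches the base Llama decoder: the embedding model shares its weights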
+    MODEL_ARCH.LLAMA_EMBED: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
    # TODO
}
    { LLM_ARCH_RND1, "rnd1" },
    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
    { LLM_ARCH_MISTRAL3, "mistral3" },
+    { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
};
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
            return {
                LLM_TENSOR_TOKEN_EMBD,
                LLM_TENSOR_OUTPUT_NORM,
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
    LLM_ARCH_MISTRAL3,
+    LLM_ARCH_LLAMA_EMBED,
    LLM_ARCH_UNKNOWN,
};
    ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
-    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+    if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
        if (hparams.n_rot != hparams.n_embd_head_k) {
            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
        }
    // arch-specific KVs
    switch (arch) {
        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_LLAMA_EMBED:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
            case LLM_ARCH_GRANITE:
            case LLM_ARCH_GRANITE_MOE:
            case LLM_ARCH_MISTRAL3:
+            case LLM_ARCH_LLAMA_EMBED:
                {
                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
-                llm = std::make_unique<llm_build_llama>(*this, params);
+                llm = std::make_unique<llm_build_llama<false>>(*this, params);
            } break;
        case LLM_ARCH_LLAMA4:
            {
                if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
-                    llm = std::make_unique<llm_build_llama>(*this, params);
+                    llm = std::make_unique<llm_build_llama<false>>(*this, params);
                } else {
                    llm = std::make_unique<llm_build_llama_iswa>(*this, params);
                }
            } break;
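+        // embedding variant of the Llama graph: bidirectional attention, no KV cache, no lm_head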
+        case LLM_ARCH_LLAMA_EMBED:
+            {
+                llm = std::make_unique<llm_build_llama<true>>(*this, params);
+            } break;
        case LLM_ARCH_DECI:
            {
                llm = std::make_unique<llm_build_deci>(*this, params);
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_LLAMA_EMBED:
            return LLAMA_ROPE_TYPE_NORM;
        // the pairs of head values are offset by n_rot/2
#include "models.h"
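+// `embed` selects the embedding build: non-causal attention without a KV cache and no lm_head,
+// so the graph ends at the final norm and exposes hidden states via res->t_embd only.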
-llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool embed>
+llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v;
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
    // inp_pos - contains the positions
    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn = build_attn_inp_kv();
+    using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
+
+    inp_attn_type * inp_attn = nullptr;
+    if constexpr (embed) {
+        inp_attn = build_attn_inp_no_cache();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
    cb(cur, "result_norm", -1);
    res->t_embd = cur;
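+    // embedding-only builds stop at the final norm: t_embd above is the graph output and no logits are produced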
-    // lm_head
-    cur = build_lora_mm(model.output, cur);
+    if constexpr (!embed) {
+        // lm_head
+        cur = build_lora_mm(model.output, cur);

-    cb(cur, "result_output", -1);
-    res->t_logits = cur;
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+    }
    ggml_build_forward_expand(gf, cur);
}
+
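+// the constructor is defined in this .cpp, so both template variants need explicit instantiation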
+template struct llm_build_llama<false>;
+template struct llm_build_llama<true>;
    llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
};
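+// embed == true builds the bidirectional embedding variant (no KV cache, no lm_head)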
+template <bool embed>
struct llm_build_llama : public llm_graph_context {
    llm_build_llama(const llama_model & model, const llm_graph_params & params);
};