#include "llama-adapter.h"
+#include "llama-impl.h"
+#include "llama-mmap.h"
#include "llama-model.h"
#include <algorithm>
// vec
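+// return the control-vector direction tensor for layer il, or nullptr when il is outside the active range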
-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}
return tensors[il];
}
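+// add the control-vector direction for layer il to the current hidden state, if one is active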
-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}
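+// lazily create one F32 direction tensor of length n_embd for every layer except layer 0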
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
const auto & hparams = model.hparams;
- GGML_ASSERT(cvec.tensors.empty());
- GGML_ASSERT(cvec.ctxs.empty());
- GGML_ASSERT(cvec.bufs.empty());
+ GGML_ASSERT(tensors.empty());
+ GGML_ASSERT(ctxs.empty());
+ GGML_ASSERT(bufs.empty());
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
}
ctx_map[buft] = ctx;
- cvec.ctxs.emplace_back(ctx);
+ ctxs.emplace_back(ctx);
return ctx;
}
};
// make tensors
- cvec.tensors.reserve(hparams.n_layer);
- cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+ tensors.reserve(hparams.n_layer);
+ tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
- ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+ ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
- cvec.tensors.push_back(tensor);
+ tensors.push_back(tensor);
}
// allocate tensors / buffers and zero
- cvec.bufs.reserve(ctx_map.size());
+ bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
return false;
}
ggml_backend_buffer_clear(buf, 0);
- cvec.bufs.emplace_back(buf);
+ bufs.emplace_back(buf);
}
return true;
}
-int32_t llama_control_vector_apply(
- struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
const llama_model & model,
const float * data,
size_t len,
if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
- cvec.layer_start = -1;
- cvec.layer_end = -1;
+ layer_start = -1;
+ layer_end = -1;
return 0;
}
return 1;
}
- if (cvec.tensors.empty()) {
- if (!llama_control_vector_init(cvec, model)) {
+ if (tensors.empty()) {
+ if (!init(model)) {
return 1;
}
}
- cvec.layer_start = il_start;
- cvec.layer_end = il_end;
+ layer_start = il_start;
+ layer_end = il_end;
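+    // copy one n_embd-wide direction per layer; the input buffer has no entry for layer 0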
for (size_t il = 1; il < hparams.n_layer; il++) {
- assert(cvec.tensors[il] != nullptr);
+ assert(tensors[il] != nullptr);
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
- ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+ ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
}
}
// lora
-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);
    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }
    return nullptr;
}
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
- delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
ggml_context * ctx_init;
};
// bundle lora_a and lora_b into pairs
- std::map<std::string, llama_lora_weight> ab_map;
+ std::map<std::string, llama_adapter_lora_weight> ab_map;
auto str_endswith = [](const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
};
if (str_endswith(name, ".lora_a")) {
replace_all(name, ".lora_a", "");
if (ab_map.find(name) == ab_map.end()) {
- ab_map[name] = llama_lora_weight(cur, nullptr);
+ ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
} else {
ab_map[name].a = cur;
}
} else if (str_endswith(name, ".lora_b")) {
replace_all(name, ".lora_b", "");
if (ab_map.find(name) == ab_map.end()) {
- ab_map[name] = llama_lora_weight(nullptr, cur);
+ ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
} else {
ab_map[name].b = cur;
}
+ } else if (str_endswith(name, "_norm.weight")) {
+ // TODO: add support for norm vector
+ // for now, we don't really care because most adapters still work fine without it
+ continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
// add tensors
for (auto & it : ab_map) {
const std::string & name = it.first;
- llama_lora_weight & w = it.second;
+ llama_adapter_lora_weight & w = it.second;
+ bool is_token_embd = str_endswith(name, "token_embd.weight");
if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
}
// device buft and device ctx
- auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+ const auto * model_tensor = model.get_tensor(name.c_str());
if (!model_tensor) {
- throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
+ throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}
struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
- if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
- throw std::runtime_error("tensor '" + name + "' has incorrect shape");
- }
- if (w.a->ne[1] != w.b->ne[0]) {
- throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+ if (is_token_embd) {
+ // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
+ if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+ }
+ } else {
+ if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
+ throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
+ }
+ if (w.a->ne[1] != w.b->ne[0]) {
+ throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
+ }
}
// save tensor to adapter
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
ggml_set_name(tensor_a, w.a->name);
ggml_set_name(tensor_b, w.b->name);
- adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+ adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
}
// allocate tensors / buffers and zero
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
- struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+ struct llama_adapter_lora * adapter = new llama_adapter_lora();
try {
- llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+ llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
return nullptr;
}
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+ delete adapter;
+}
#pragma once
-#include "llama-impl.h"
-#include "llama-hparams.h"
+#include "llama.h"
#include "ggml-cpp.h"
+#include <string>
#include <unordered_map>
#include <vector>
+// TODO: pimpl
+
//
// llama_adapter_cvec
//
-// TODO: rename to llama_adapter_cvec
-struct llama_control_vector {
- std::vector<ggml_context_ptr> ctxs;
- std::vector<ggml_backend_buffer_ptr> bufs;
+struct llama_adapter_cvec {
+ struct ggml_tensor * tensor_for(int il) const;
- std::vector<struct ggml_tensor *> tensors; // per layer
+ struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+
+ int32_t apply(
+ const llama_model & model,
+ const float * data,
+ size_t len,
+ int32_t n_embd,
+ int32_t il_start,
+ int32_t il_end);
+
+private:
+ bool init(const llama_model & model);
int32_t layer_start = -1;
int32_t layer_end = -1;
- struct ggml_tensor * tensor_for(int il) const;
+ std::vector<ggml_context_ptr> ctxs;
+ std::vector<ggml_backend_buffer_ptr> bufs;
- struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+ std::vector<struct ggml_tensor *> tensors; // per layer
};
-int32_t llama_control_vector_apply(
- struct llama_control_vector & cvec,
- const llama_model & model,
- const float * data,
- size_t len,
- int32_t n_embd,
- int32_t il_start,
- int32_t il_end);
-
//
// llama_adapter_lora
//
-// TODO: rename to llama_adapter_lora_weight
-struct llama_lora_weight {
+struct llama_adapter_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;
- llama_lora_weight() = default;
- llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+ // get actual scale based on rank and alpha
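+    // e.g. alpha = 16 with rank b->ne[0] = 8 and adapter_scale = 1.0f gives an effective scale of 2.0f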
+ float get_scale(float alpha, float adapter_scale) const {
+ const float rank = (float) b->ne[0];
+ const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
+ return scale;
+ }
+
+ llama_adapter_lora_weight() = default;
+ llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
-// TODO: rename to llama_adapter_lora
-struct llama_lora_adapter {
+struct llama_adapter_lora {
// map tensor name to lora_a_b
- std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+ std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
float alpha;
- llama_lora_adapter() = default;
- ~llama_lora_adapter() = default;
+ llama_adapter_lora() = default;
+ ~llama_adapter_lora() = default;
- llama_lora_weight * get_weight(struct ggml_tensor * w);
+ llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
};
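+
+// typical lifecycle (sketch): an adapter is created once per file with llama_adapter_lora_init(),
+// attached to one or more contexts together with a scale via the public llama.h API, and released
+// with llama_adapter_lora_free() when it is no longer needed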
{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
+ { LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
+ { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_PHIMOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
{
LLM_ARCH_PLAMO,
{
{ LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
{ LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
{ LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
+ { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
{ LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
{ LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
{ LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
},
},
+ {
+ LLM_ARCH_RWKV6QWEN2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
+ { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
+ { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
+ { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
+ { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
+ { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
+ { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
+ { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
+ { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
+ { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
+ { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
+ { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_GRANITE,
{
{LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
{LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}},
{LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
+ LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO,
LLM_ARCH_CODESHELL,
LLM_ARCH_ORION,
LLM_ARCH_NEMOTRON,
LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6,
+ LLM_ARCH_RWKV6QWEN2,
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
LLM_KV_TIME_DECAY_EXTRA_DIM,
LLM_KV_RESIDUAL_SCALE,
LLM_KV_EMBEDDING_SCALE,
+ LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
+ LLM_KV_TOKENIZER_CHAT_TEMPLATE,
LLM_KV_TOKENIZER_FIM_PRE_ID,
LLM_KV_TOKENIZER_FIM_SUF_ID,
LLM_KV_TOKENIZER_FIM_MID_ID,
LLM_TENSOR_TIME_MIX_LERP_V,
LLM_TENSOR_TIME_MIX_LERP_R,
LLM_TENSOR_TIME_MIX_LERP_G,
+ LLM_TENSOR_TIME_MIX_LERP_FUSED,
LLM_TENSOR_TIME_MIX_FIRST,
LLM_TENSOR_TIME_MIX_DECAY,
LLM_TENSOR_TIME_MIX_DECAY_W1,
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+ { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
return tmpl.find(haystack) != std::string::npos;
};
if (tmpl_contains("<|im_start|>")) {
- return LLM_CHAT_TEMPLATE_CHATML;
+ return tmpl_contains("<|im_sep|>")
+ ? LLM_CHAT_TEMPLATE_PHI_4
+ : LLM_CHAT_TEMPLATE_CHATML;
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
if (tmpl_contains("[SYSTEM_PROMPT]")) {
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
if (add_ass) {
ss << "<|assistant|>\n";
}
+ } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_4) {
+        // chatml-style template that separates the role with <|im_sep|> instead of a newline (Phi-4)
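+        // e.g. a single user message "hi" with add_ass enabled renders as:
+        //   <|im_start|>user<|im_sep|>hi<|im_end|><|im_start|>assistant<|im_sep|>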
+ for (auto message : chat) {
+ ss << "<|im_start|>" << message->role << "<|im_sep|>" << message->content << "<|im_end|>";
+ }
+ if (add_ass) {
+ ss << "<|im_start|>assistant<|im_sep|>";
+ }
} else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
// Falcon 3
for (auto message : chat) {
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_PHI_3,
+ LLM_CHAT_TEMPLATE_PHI_4,
LLM_CHAT_TEMPLATE_FALCON_3,
LLM_CHAT_TEMPLATE_ZEPHYR,
LLM_CHAT_TEMPLATE_MONARCH,
#include "llama-context.h"
+#include "llama-impl.h"
+#include "llama-mmap.h"
+
#include <cassert>
#include <cmath>
#include <cstring>
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
const auto & cparams = lctx.cparams;
const auto & hparams = lctx.model.hparams;
+ const auto & vocab = lctx.model.vocab;
const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
const auto n_batch = cparams.n_batch;
- const auto n_vocab = hparams.n_vocab;
+ const auto n_vocab = vocab.n_tokens();
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
auto * buft = ggml_backend_cpu_buffer_type();
// try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
- auto * output_dev = lctx.model.dev_output.dev;
+ auto * output_dev = lctx.model.dev_output();
auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
if (output_dev_host_buft) {
buft = output_dev_host_buft;
void llama_output_reorder(struct llama_context & ctx) {
std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = ctx.model.hparams.n_vocab;
+ const uint32_t n_vocab = ctx.model.vocab.n_tokens();
const uint32_t n_embd = ctx.model.hparams.n_embd;
const int32_t n_outputs = ctx.n_outputs;
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
}
- return ctx->logits + j*ctx->model.hparams.n_vocab;
+ return ctx->logits + j*ctx->model.vocab.n_tokens();
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
}
void write_logits(const struct llama_context * ctx) {
- const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
+ const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
write(&logits_size, sizeof(logits_size));
const struct llama_model & model;
- struct llama_cparams cparams;
- struct llama_sbatch sbatch; // TODO: revisit if needed
- struct llama_kv_cache kv_self;
- struct llama_control_vector cvec;
+ struct llama_cparams cparams;
+ struct llama_sbatch sbatch; // TODO: revisit if needed
+ struct llama_kv_cache kv_self;
+ struct llama_adapter_cvec cvec;
- std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+ std::unordered_map<struct llama_adapter_lora *, float> lora;
std::vector<ggml_backend_ptr> backends;
std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
for (size_t i = 0; i < cur_p->size; ++i) {
const llama_token id = cur_p->data[i].id;
- const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
+ const std::string & piece = grammar.vocab->token_to_piece(id);
- if (llama_token_is_eog_impl(*grammar.vocab, id)) {
+ if (grammar.vocab->is_eog(id)) {
if (!allow_eog) {
cur_p->data[i].logit = -INFINITY;
}
void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
GGML_ASSERT(grammar.vocab != nullptr);
- if (llama_token_is_eog_impl(*grammar.vocab, token)) {
+ if (grammar.vocab->is_eog(token)) {
for (const auto & stack : grammar.stacks) {
if (stack.empty()) {
return;
GGML_ABORT("fatal error");
}
- const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
+ const std::string & piece = grammar.vocab->token_to_piece(token);
// Note terminating 0 in decoded string
const auto decoded = decode_utf8(piece, grammar.partial_utf8);
uint32_t llama_hparams::n_embd_k_s() const {
if (wkv_head_size != 0) {
// for RWKV models
- return 2 * n_embd;
+ return token_shift_count * n_embd;
}
// TODO: maybe support other convolution strides than 1
bool use_par_res;
bool swin_norm;
- uint32_t n_vocab = 0;
uint32_t n_ctx_train; // context size the model was trained on
uint32_t n_embd;
uint32_t n_embd_features = 0;
uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
uint32_t n_expert = 0;
uint32_t n_expert_used = 0;
- uint32_t n_vocab_type = 0; // for BERT-style token types
uint32_t n_rel_attn_bkts = 0;
// for WavTokenizer
uint32_t time_mix_extra_dim = 0;
uint32_t time_decay_extra_dim = 0;
uint32_t wkv_head_size = 0;
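+    // number of shifted token states kept per layer by RWKV-style models (see n_embd_k_s)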
+ uint32_t token_shift_count = 2;
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
#include "llama-impl.h"
+#include "gguf.h"
#include "llama.h"
#include <cinttypes>
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
- const void * data = gguf_get_arr_data(ctx_gguf, i);
+ const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
ggml_backend_buffer_type_t buft;
if (offload) {
- auto * dev = model.dev_layer.at(i).dev;
+ auto * dev = model.dev_layer(i);
buft = ggml_backend_dev_buffer_type(dev);
} else {
buft = ggml_backend_cpu_buffer_type();
// TODO: consider moving to llama-impl.h if needed in more places
#if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
#include <cstring>
#include <future>
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
const char * llama_file_version_name(llama_fver version) {
switch (version) {
case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
return "unknown";
}
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+ if (ftype & LLAMA_FTYPE_GUESSED) {
+ return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+ }
+
+ switch (ftype) {
+ case LLAMA_FTYPE_ALL_F32: return "all F32";
+ case LLAMA_FTYPE_MOSTLY_F16: return "F16";
+ case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
+ case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
+ case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
+ case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
+ case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
+ case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
+ case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
+ case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+ case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
+ case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+ case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
+
+ default: return "unknown, may not work";
+ }
+}
+
namespace GGUFMeta {
- template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+ template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
struct GKV_Base_Type {
static constexpr gguf_type gt = gt_;
public:
static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
static ArrayInfo getter(const gguf_context *ctx, const int k) {
+ const enum gguf_type arr_type = gguf_get_arr_type(ctx, k);
return ArrayInfo {
- gguf_get_arr_type(ctx, k),
+ arr_type,
size_t(gguf_get_arr_n(ctx, k)),
- gguf_get_arr_data(ctx, k),
+ arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k),
};
}
};
const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
: gguf_type_name(type);
std::string value = gguf_kv_to_str(meta.get(), i);
return true;
}
+
+std::string llama_model_loader::ftype_name() const {
+ return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+ LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+ LLAMA_LOG_INFO("%s: file type = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+ if (n_bytes < GiB) {
+ LLAMA_LOG_INFO("%s: file size = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+ } else {
+ LLAMA_LOG_INFO("%s: file size = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+ }
+}
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data);
+
+ std::string ftype_name() const;
+
+ void print_info() const;
};
#include "llama-model.h"
#include "llama-impl.h"
+#include "llama-mmap.h"
#include "llama-model-loader.h"
-#include "unicode.h" // TODO: remove
+#include "ggml-cpp.h"
#include <algorithm>
#include <cassert>
+#include <cstring>
#include <functional>
+#include <map>
#include <sstream>
#include <stdexcept>
-static const size_t kiB = 1024;
-static const size_t MiB = 1024*kiB;
-static const size_t GiB = 1024*MiB;
-
const char * llm_type_name(llm_type type) {
switch (type) {
- case MODEL_14M: return "14M";
- case MODEL_17M: return "17M";
- case MODEL_22M: return "22M";
- case MODEL_33M: return "33M";
- case MODEL_60M: return "60M";
- case MODEL_70M: return "70M";
- case MODEL_80M: return "80M";
- case MODEL_109M: return "109M";
- case MODEL_137M: return "137M";
- case MODEL_160M: return "160M";
- case MODEL_220M: return "220M";
- case MODEL_250M: return "250M";
- case MODEL_270M: return "270M";
- case MODEL_335M: return "335M";
- case MODEL_410M: return "410M";
- case MODEL_450M: return "450M";
- case MODEL_770M: return "770M";
- case MODEL_780M: return "780M";
- case MODEL_0_5B: return "0.5B";
- case MODEL_1B: return "1B";
- case MODEL_1_3B: return "1.3B";
- case MODEL_1_4B: return "1.4B";
- case MODEL_1_5B: return "1.5B";
- case MODEL_1_6B: return "1.6B";
- case MODEL_2B: return "2B";
- case MODEL_2_8B: return "2.8B";
- case MODEL_3B: return "3B";
- case MODEL_4B: return "4B";
- case MODEL_6B: return "6B";
- case MODEL_6_9B: return "6.9B";
- case MODEL_7B: return "7B";
- case MODEL_8B: return "8B";
- case MODEL_9B: return "9B";
- case MODEL_11B: return "11B";
- case MODEL_12B: return "12B";
- case MODEL_13B: return "13B";
- case MODEL_14B: return "14B";
- case MODEL_15B: return "15B";
- case MODEL_16B: return "16B";
- case MODEL_20B: return "20B";
- case MODEL_30B: return "30B";
- case MODEL_32B: return "32B";
- case MODEL_34B: return "34B";
- case MODEL_35B: return "35B";
- case MODEL_40B: return "40B";
- case MODEL_65B: return "65B";
- case MODEL_70B: return "70B";
- case MODEL_236B: return "236B";
- case MODEL_314B: return "314B";
- case MODEL_671B: return "671B";
- case MODEL_SMALL: return "0.1B";
- case MODEL_MEDIUM: return "0.4B";
- case MODEL_LARGE: return "0.8B";
- case MODEL_XL: return "1.5B";
- case MODEL_A1_7B: return "A1.7B";
- case MODEL_A2_7B: return "A2.7B";
- case MODEL_8x7B: return "8x7B";
- case MODEL_8x22B: return "8x22B";
- case MODEL_16x12B: return "16x12B";
- case MODEL_10B_128x3_66B: return "10B+128x3.66B";
- case MODEL_57B_A14B: return "57B.A14B";
- case MODEL_27B: return "27B";
- default: return "?B";
- }
-}
-
-static std::string llama_model_ftype_name(llama_ftype ftype) {
- if (ftype & LLAMA_FTYPE_GUESSED) {
- return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
- }
-
- switch (ftype) {
- case LLAMA_FTYPE_ALL_F32: return "all F32";
- case LLAMA_FTYPE_MOSTLY_F16: return "F16";
- case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
- case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
- case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
- case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
- case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
- case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
- case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
- case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
- case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
- case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
- case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
- case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
- case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-
- default: return "unknown, may not work";
+ case LLM_TYPE_14M: return "14M";
+ case LLM_TYPE_17M: return "17M";
+ case LLM_TYPE_22M: return "22M";
+ case LLM_TYPE_33M: return "33M";
+ case LLM_TYPE_60M: return "60M";
+ case LLM_TYPE_70M: return "70M";
+ case LLM_TYPE_80M: return "80M";
+ case LLM_TYPE_109M: return "109M";
+ case LLM_TYPE_137M: return "137M";
+ case LLM_TYPE_160M: return "160M";
+ case LLM_TYPE_220M: return "220M";
+ case LLM_TYPE_250M: return "250M";
+ case LLM_TYPE_270M: return "270M";
+ case LLM_TYPE_335M: return "335M";
+ case LLM_TYPE_410M: return "410M";
+ case LLM_TYPE_450M: return "450M";
+ case LLM_TYPE_770M: return "770M";
+ case LLM_TYPE_780M: return "780M";
+ case LLM_TYPE_0_5B: return "0.5B";
+ case LLM_TYPE_1B: return "1B";
+ case LLM_TYPE_1_3B: return "1.3B";
+ case LLM_TYPE_1_4B: return "1.4B";
+ case LLM_TYPE_1_5B: return "1.5B";
+ case LLM_TYPE_1_6B: return "1.6B";
+ case LLM_TYPE_2B: return "2B";
+ case LLM_TYPE_2_8B: return "2.8B";
+ case LLM_TYPE_3B: return "3B";
+ case LLM_TYPE_4B: return "4B";
+ case LLM_TYPE_6B: return "6B";
+ case LLM_TYPE_6_9B: return "6.9B";
+ case LLM_TYPE_7B: return "7B";
+ case LLM_TYPE_8B: return "8B";
+ case LLM_TYPE_9B: return "9B";
+ case LLM_TYPE_11B: return "11B";
+ case LLM_TYPE_12B: return "12B";
+ case LLM_TYPE_13B: return "13B";
+ case LLM_TYPE_14B: return "14B";
+ case LLM_TYPE_15B: return "15B";
+ case LLM_TYPE_16B: return "16B";
+ case LLM_TYPE_20B: return "20B";
+ case LLM_TYPE_30B: return "30B";
+ case LLM_TYPE_32B: return "32B";
+ case LLM_TYPE_34B: return "34B";
+ case LLM_TYPE_35B: return "35B";
+ case LLM_TYPE_40B: return "40B";
+ case LLM_TYPE_65B: return "65B";
+ case LLM_TYPE_70B: return "70B";
+ case LLM_TYPE_236B: return "236B";
+ case LLM_TYPE_314B: return "314B";
+ case LLM_TYPE_671B: return "671B";
+ case LLM_TYPE_SMALL: return "0.1B";
+ case LLM_TYPE_MEDIUM: return "0.4B";
+ case LLM_TYPE_LARGE: return "0.8B";
+ case LLM_TYPE_XL: return "1.5B";
+ case LLM_TYPE_A1_7B: return "A1.7B";
+ case LLM_TYPE_A2_7B: return "A2.7B";
+ case LLM_TYPE_8x7B: return "8x7B";
+ case LLM_TYPE_8x22B: return "8x22B";
+ case LLM_TYPE_16x12B: return "16x12B";
+ case LLM_TYPE_16x3_8B: return "16x3.8B";
+ case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
+ case LLM_TYPE_57B_A14B: return "57B.A14B";
+ case LLM_TYPE_27B: return "27B";
+ default: return "?B";
}
}
}
}
-std::string llama_model_arch_name (const llama_model & model) {
- return llm_arch_name(model.arch);
-}
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+ { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+ { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+ { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+ { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
+};
-std::string llama_model_type_name (const llama_model & model) {
- return llm_type_name(model.type);
-}
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
+ for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
+ if (kv.second == name) {
+ return (llama_rope_scaling_type) kv.first;
+ }
+ }
-std::string llama_model_ftype_name(const llama_model & model) {
- return llama_model_ftype_name(model.ftype);
+ return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
-template<typename F>
-static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+ GGML_ASSERT(w != nullptr);
+
+ if (op == GGML_OP_NONE) {
+ return true;
+ }
+
ggml_init_params params = {
/*.mem_size =*/ ggml_tensor_overhead()*8,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
-
- ggml_context_ptr ctx { ggml_init(params) };
- if (!ctx) {
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
+ if (!ctx_ptr) {
throw std::runtime_error(format("failed to create ggml context"));
}
+ ggml_context * ctx = ctx_ptr.get();
- ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
- ggml_tensor * op_tensor = fn(ctx.get());
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- if (op_tensor->src[i] != nullptr) {
- assert(op_tensor->src[i]->buffer == nullptr);
- op_tensor->src[i]->buffer = buf.get();
- }
+ ggml_tensor * op_tensor = nullptr;
+
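+    // build a minimal dummy op that uses w the same way the real graph would; the extra tensor
+    // shapes only need to be plausible enough for the supports_op check below to be meaningful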
+ switch (op) {
+ case GGML_OP_GET_ROWS:
+ {
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_get_rows(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul_mat(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ int n_expert_used = hparams.n_expert_used;
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+ } break;
+ case GGML_OP_ADD:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_add(ctx, a, w);
+ } break;
+ case GGML_OP_MUL:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul(ctx, a, w);
+ } break;
+ case GGML_OP_DIV:
+ {
+ ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+ op_tensor = ggml_div(ctx, a, w);
+ } break;
+ case GGML_OP_ROPE:
+ {
+ int n_embd_head = hparams.n_embd_head_v;
+ int n_head = hparams.n_head();
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
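+                // the rope parameters below are zeroed placeholders; this call only probes backend support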
+ op_tensor = ggml_rope_ext(
+ ctx, a, b, w,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+
+ } break;
+ case GGML_OP_SSM_CONV:
+ {
+ // FIXME
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
+ op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ // FIXME
+ const int64_t d_state = w->ne[0];
+ const int64_t d_inner = w->ne[1];
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 1;
+ ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
+ ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+ ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+ ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
+ } break;
+ case GGML_OP_RWKV_WKV6:
+ {
+ // FIXME
+ const int64_t S = 123;
+ const int64_t H = 123;
+ const int64_t n_tokens = 123;
+ const int64_t n_seqs = 123;
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * tf = w;
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+ op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+ } break;
+ case GGML_OP_IM2COL:
+ {
+ const int n_embd = hparams.n_embd;
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+ } break;
+ default:
+ GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
}
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+ GGML_ASSERT(w->buffer == nullptr);
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+ ggml_backend_buffer_free(w->buffer);
+ w->buffer = nullptr;
return op_supported;
}
-template<typename F>
-static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
+ GGML_ASSERT(!buft_list.empty());
for (const auto & cur : buft_list) {
ggml_backend_dev_t cur_dev = cur.first;
ggml_backend_buffer_type_t cur_buft = cur.second;
- if (buft_supported(cur_buft, cur_dev, fn)) {
+ if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
return cur_buft;
}
}
-
- throw std::runtime_error(format("no suitable buffer type found"));
+ return nullptr;
}
-ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
- return select_buft(
- *model.dev_layer.at(il).buft_list,
- [&](ggml_context * ctx) {
- ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
- ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
- return ggml_add(ctx, cur, layer_dir);
- });
-}
+// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+ buft_list_t buft_list;
+
+ // add ACCEL buffer types
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+ auto * buft = ggml_backend_dev_buffer_type(dev);
+            // skip the plain CPU buffer type; it is added last as the final fallback
+ if (buft != ggml_backend_cpu_buffer_type()) {
+ buft_list.emplace_back(dev, buft);
+ }
+ }
+ }
-struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name) {
- auto it = std::find_if(model.tensors_by_name.begin(), model.tensors_by_name.end(),
- [name](const std::pair<std::string, struct ggml_tensor *> & it) {
- return it.first == name;
- });
- if (it == model.tensors_by_name.end()) {
- return nullptr;
+ // add extra buffer types
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(cpu_dev, *extra_bufts);
+ ++extra_bufts;
+ }
}
- return it->second;
-}
+ // add a host buffer type
+ // storing the tensors in a host buffer is useful when the processing of large batches
+ // is offloaded to a GPU device, since it reduces the time spent on data transfers
+ // generally, this will be done using the first device in the list
+ // a better approach would be to handle this on a weight-by-weight basis using the offload_op
+ // function of the device to determine if it would benefit from being stored in a host buffer
+ for (auto * dev : devices) {
+ ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+ if (buft) {
+ buft_list.emplace_back(dev, buft);
+ break;
+ }
+ }
-size_t llama_model_max_nodes(const llama_model & model) {
- return std::max<size_t>(8192, model.tensors_by_name.size()*5);
-}
+ // add the CPU buffer type
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+ }
+ }
-static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
- { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
- { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
- { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
- { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
-};
+ return buft_list;
+}
-static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
- for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
- if (kv.second == name) {
- return (llama_rope_scaling_type) kv.first;
+// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
+static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
+ buft_list_t buft_list;
+
+ // add the device split buffer type if requested and available
+ if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
+ ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+ if (ggml_backend_split_buffer_type_fn) {
+ size_t dev_index = [&]() {
+ auto * reg = ggml_backend_dev_backend_reg(dev);
+ for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
+ if (ggml_backend_reg_dev_get(reg, i) == dev) {
+ return i;
+ }
+ }
+ throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
+ }();
+ auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
+ if (buft != nullptr) {
+ buft_list.emplace_back(dev, buft);
+ }
}
}
- return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+ // add the device default buffer type
+ buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+
+ return buft_list;
}
-// NOTE: avoid ever using this except for building the token_to_piece caches
-static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
- std::string piece;
- piece.resize(piece.capacity()); // using string internal cache
- const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
- if (n_chars < 0) {
- piece.resize(-n_chars);
- int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
- GGML_ASSERT(check == -n_chars);
- }
- else {
- piece.resize(n_chars);
- }
+struct llama_model::impl {
+ impl() {}
+ ~impl() {}
+
+ uint64_t n_elements = 0;
+
+ size_t n_bytes = 0;
+
+ std::string desc_str;
+
+ // model memory mapped files
+ llama_mmaps mappings;
+
+ // objects representing data potentially being locked in memory
+ llama_mlocks mlock_bufs;
+ llama_mlocks mlock_mmaps;
+
+ // contexts where the model tensors metadata is stored
+ std::vector<ggml_context_ptr> ctxs;
+
+ // the model memory buffers for the tensor data
+ std::vector<ggml_backend_buffer_ptr> bufs;
+
+ buft_list_t cpu_buft_list;
+ std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
+
+ struct layer_dev {
+ ggml_backend_dev_t dev;
+ buft_list_t * buft_list;
+ };
- return piece;
+ layer_dev dev_input = {};
+ layer_dev dev_output = {};
+ std::vector<layer_dev> dev_layer;
+};
+
+llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
}
-void llm_load_stats(llama_model_loader & ml, llama_model & model) {
- model.n_elements = ml.n_elements;
- model.n_bytes = ml.n_bytes;
+llama_model::~llama_model() {}
+
+void llama_model::load_stats(llama_model_loader & ml) {
+ pimpl->n_elements = ml.n_elements;
+ pimpl->n_bytes = ml.n_bytes;
}
-void llm_load_arch(llama_model_loader & ml, llama_model & model) {
- model.arch = ml.get_arch();
- if (model.arch == LLM_ARCH_UNKNOWN) {
+void llama_model::load_arch(llama_model_loader & ml) {
+ arch = ml.get_arch();
+ if (arch == LLM_ARCH_UNKNOWN) {
throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
}
}
-void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
- auto & hparams = model.hparams;
+void llama_model::load_hparams(llama_model_loader & ml) {
const gguf_context * ctx = ml.meta.get();
// get metadata as string
}
const char * name = gguf_get_key(ctx, i);
const std::string value = gguf_kv_to_str(ctx, i);
- model.gguf_kv.emplace(name, value);
+ gguf_kv.emplace(name, value);
}
// get general kv
- ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
-
- // get hparams kv
- ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
+ ml.get_key(LLM_KV_GENERAL_NAME, name, false);
// everything past this point is not vocab-related
if (hparams.vocab_only) {
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
- if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+ if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
hparams.n_embd_head_v = 0;
}
- using e_model = llm_type; // TMP
+ // for differentiating model types
+ uint32_t n_vocab = 0;
+ ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// arch-specific KVs
- switch (model.arch) {
+ switch (arch) {
case LLM_ARCH_LLAMA:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
if (hparams.n_expert == 8) {
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_8x7B; break;
- case 56: model.type = e_model::MODEL_8x22B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_8x7B; break;
+ case 56: type = LLM_TYPE_8x22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} else {
switch (hparams.n_layer) {
- case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
- case 22: model.type = e_model::MODEL_1B; break;
- case 26: model.type = e_model::MODEL_3B; break;
- case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
+ case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
+ case 22: type = LLM_TYPE_1B; break;
+ case 26: type = LLM_TYPE_3B; break;
+ case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
// granite uses a vocab with len 49152
- case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
- case 36: model.type = e_model::MODEL_8B; break; // granite
- case 40: model.type = e_model::MODEL_13B; break;
- case 48: model.type = e_model::MODEL_34B; break;
- case 60: model.type = e_model::MODEL_30B; break;
- case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+ case 36: type = LLM_TYPE_8B; break; // granite
+ case 40: type = LLM_TYPE_13B; break;
+ case 48: type = LLM_TYPE_34B; break;
+ case 60: type = LLM_TYPE_30B; break;
+ case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
}
} break;
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 80: model.type = e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 80: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
switch (hparams.n_layer) {
- case 52: model.type = e_model::MODEL_1B; break;
- case 40: model.type = e_model::MODEL_2B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 52: type = LLM_TYPE_1B; break;
+ case 40: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MINICPM3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
- ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
switch (hparams.n_layer) {
- case 62: model.type = e_model::MODEL_4B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 62: type = LLM_TYPE_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GROK:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 64: model.type = e_model::MODEL_314B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 64: type = LLM_TYPE_314B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_FALCON:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 60: model.type = e_model::MODEL_40B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 60: type = LLM_TYPE_40B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_BAICHUAN:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
- if (model.type == e_model::MODEL_13B) {
+ if (type == LLM_TYPE_13B) {
// TODO: become GGUF KV parameter
hparams.f_max_alibi_bias = 8.0f;
}
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1B; break;
- case 36: model.type = e_model::MODEL_3B; break;
- case 42: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_15B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24: type = LLM_TYPE_1B; break;
+ case 36: type = LLM_TYPE_3B; break;
+ case 42: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_15B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_REFACT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_1B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
// TODO: become GGUF KV parameter
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 3:
- model.type = e_model::MODEL_17M; break; // bge-micro
+ type = LLM_TYPE_17M; break; // bge-micro
case 6:
- model.type = e_model::MODEL_22M; break; // MiniLM-L6
+ type = LLM_TYPE_22M; break; // MiniLM-L6
case 12:
switch (hparams.n_embd) {
- case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
- case 768: model.type = e_model::MODEL_109M; break; // bge-base
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
+ case 768: type = LLM_TYPE_109M; break; // bge-base
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 24:
- model.type = e_model::MODEL_335M; break; // bge-large
- default: model.type = e_model::MODEL_UNKNOWN;
+ type = LLM_TYPE_335M; break; // bge-large
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
hparams.f_max_alibi_bias = 8.0f;
switch (hparams.n_layer) {
- case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
- case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
+ case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_NOMIC_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
- model.type = e_model::MODEL_137M;
+ type = LLM_TYPE_137M;
}
} break;
case LLM_ARCH_BLOOM:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1B; break;
+ case 24: type = LLM_TYPE_1B; break;
case 30:
switch (hparams.n_embd) {
- case 2560: model.type = e_model::MODEL_3B; break;
- case 4096: model.type = e_model::MODEL_7B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
}
// TODO: become GGUF KV parameter
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 48: model.type = e_model::MODEL_30B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 48: type = LLM_TYPE_30B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_STABLELM:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1B; break;
- case 32: model.type = e_model::MODEL_3B; break;
- case 40: model.type = e_model::MODEL_12B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ case 40: type = LLM_TYPE_12B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2VL:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
- case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 36: model.type = e_model::MODEL_3B; break;
- case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
- case 48: model.type = e_model::MODEL_14B; break;
- case 64: model.type = e_model::MODEL_32B; break;
- case 80: model.type = e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
+ case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 36: type = LLM_TYPE_3B; break;
+ case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
+ case 48: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ case 80: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_QWEN2MOE:
{
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_A2_7B; break;
- case 28: model.type = e_model::MODEL_57B_A14B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24: type = LLM_TYPE_A2_7B; break;
+ case 28: type = LLM_TYPE_57B_A14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_PHI2:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1B; break;
- case 32: model.type = e_model::MODEL_3B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_PHI3:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1B; break;
- case 32: model.type = e_model::MODEL_3B; break;
- case 40: model.type = e_model::MODEL_14B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ case 40: type = LLM_TYPE_14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
// for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
throw std::runtime_error("invalid value for sliding_window");
}
} break;
+ case LLM_ARCH_PHIMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_16x3_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_PLAMO:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 40: model.type = e_model::MODEL_13B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GPT2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 12: model.type = e_model::MODEL_SMALL; break;
- case 24: model.type = e_model::MODEL_MEDIUM; break;
- case 36: model.type = e_model::MODEL_LARGE; break;
- case 48: model.type = e_model::MODEL_XL; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 12: type = LLM_TYPE_SMALL; break;
+ case 24: type = LLM_TYPE_MEDIUM; break;
+ case 36: type = LLM_TYPE_LARGE; break;
+ case 48: type = LLM_TYPE_XL; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_CODESHELL:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 42: model.type = e_model::MODEL_7B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 42: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_ORION:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 40: model.type = e_model::MODEL_14B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 40: type = LLM_TYPE_14B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_INTERNLM2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 48: model.type = e_model::MODEL_20B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 48: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GEMMA:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 18: model.type = e_model::MODEL_2B; break;
- case 28: model.type = e_model::MODEL_7B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 18: type = LLM_TYPE_2B; break;
+ case 28: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GEMMA2:
{
hparams.n_swa = 4096; // default value of gemma 2
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
- ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+ ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+ ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
hparams.attn_soft_cap = true;
switch (hparams.n_layer) {
- case 26: model.type = e_model::MODEL_2B; break;
- case 42: model.type = e_model::MODEL_9B; break;
- case 46: model.type = e_model::MODEL_27B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 26: type = LLM_TYPE_2B; break;
+ case 42: type = LLM_TYPE_9B; break;
+ case 46: type = LLM_TYPE_27B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_STARCODER2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 30: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_15B; break;
- case 52: model.type = e_model::MODEL_20B; break; // granite
- case 88: model.type = e_model::MODEL_34B; break; // granite
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 30: type = LLM_TYPE_3B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_15B; break;
+ case 52: type = LLM_TYPE_20B; break; // granite
+ case 88: type = LLM_TYPE_34B; break; // granite
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_MAMBA:
switch (hparams.n_layer) {
case 24:
switch (hparams.n_embd) {
- case 768: model.type = e_model::MODEL_SMALL; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 768: type = LLM_TYPE_SMALL; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 48:
switch (hparams.n_embd) {
- case 1024: model.type = e_model::MODEL_MEDIUM; break;
- case 1536: model.type = e_model::MODEL_LARGE; break;
- case 2048: model.type = e_model::MODEL_XL; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 1024: type = LLM_TYPE_MEDIUM; break;
+ case 1536: type = LLM_TYPE_LARGE; break;
+ case 2048: type = LLM_TYPE_XL; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 64:
switch (hparams.n_embd) {
- case 2560: model.type = e_model::MODEL_3B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 2560: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_XVERSE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- case 80: model.type = e_model::MODEL_65B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ case 80: type = LLM_TYPE_65B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_COMMAND_R:
{
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 40: model.type = e_model::MODEL_35B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 40: type = LLM_TYPE_35B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_COHERE2:
{
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_DBRX:
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
switch (hparams.n_layer) {
- case 40: model.type = e_model::MODEL_16x12B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 40: type = LLM_TYPE_16x12B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_OLMO:
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
switch (hparams.n_layer) {
- case 22: model.type = e_model::MODEL_1B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 80: model.type = e_model::MODEL_70B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 22: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 80: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_OLMO2:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 16: model.type = e_model::MODEL_1B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 16: type = LLM_TYPE_1B; break;
+ case 32: type = LLM_TYPE_7B; break;
+ case 40: type = LLM_TYPE_13B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_OLMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 16: model.type = e_model::MODEL_A1_7B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 16: type = LLM_TYPE_A1_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_OPENELM:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 16: model.type = e_model::MODEL_270M; break;
- case 20: model.type = e_model::MODEL_450M; break;
- case 28: model.type = e_model::MODEL_1B; break;
- case 36: model.type = e_model::MODEL_3B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 16: type = LLM_TYPE_270M; break;
+ case 20: type = LLM_TYPE_450M; break;
+ case 28: type = LLM_TYPE_1B; break;
+ case 36: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GPTNEOX:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+ ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
switch (hparams.n_layer) {
case 6:
switch (hparams.n_ff()) {
- case 512: model.type = e_model::MODEL_14M; break;
- case 2048: model.type = e_model::MODEL_70M; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 512: type = LLM_TYPE_14M; break;
+ case 2048: type = LLM_TYPE_70M; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 12:
switch (hparams.n_ff()) {
- case 3072: model.type = e_model::MODEL_160M; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 3072: type = LLM_TYPE_160M; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 16:
switch (hparams.n_ff()) {
- case 8192: model.type = e_model::MODEL_1B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 8192: type = LLM_TYPE_1B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 24:
switch (hparams.n_ff()) {
- case 4096: model.type = e_model::MODEL_410M; break;
- case 8192: model.type = e_model::MODEL_1_4B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 4096: type = LLM_TYPE_410M; break;
+ case 8192: type = LLM_TYPE_1_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 32:
switch (hparams.n_ff()) {
- case 10240: model.type = e_model::MODEL_2_8B; break;
- case 16384: model.type = e_model::MODEL_6_9B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 10240: type = LLM_TYPE_2_8B; break;
+ case 16384: type = LLM_TYPE_6_9B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 36:
switch (hparams.n_ff()) {
- case 20480: model.type = e_model::MODEL_12B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 20480: type = LLM_TYPE_12B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 44:
switch (hparams.n_ff()) {
- case 24576: model.type = e_model::MODEL_20B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 24576: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_ARCTIC:
if (hparams.n_expert == 128) {
switch (hparams.n_layer) {
- case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 35: type = LLM_TYPE_10B_128x3_66B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} else {
- model.type = e_model::MODEL_UNKNOWN;
+ type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_DEEPSEEK:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
switch (hparams.n_layer) {
- case 28: model.type = e_model::MODEL_20B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 28: type = LLM_TYPE_20B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_DEEPSEEK2:
{
bool is_lite = (hparams.n_layer == 27);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
if (!is_lite) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
}
- ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
- ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
// that have no expert_gating_func model parameter set
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
switch (hparams.n_layer) {
- case 27: model.type = e_model::MODEL_16B; break;
- case 60: model.type = e_model::MODEL_236B; break;
- case 61: model.type = e_model::MODEL_671B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 27: type = LLM_TYPE_16B; break;
+ case 60: type = LLM_TYPE_236B; break;
+ case 61: type = LLM_TYPE_671B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_CHATGLM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 28: model.type = e_model::MODEL_6B; break;
- case 40: model.type = e_model::MODEL_9B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 28: type = LLM_TYPE_6B; break;
+ case 40: type = LLM_TYPE_9B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_BITNET:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 26: model.type = e_model::MODEL_3B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 26: type = LLM_TYPE_3B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_T5:
{
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
uint32_t dec_start_token_id;
}
switch (hparams.n_layer) {
- case 6: model.type = e_model::MODEL_60M; break; // t5-small
- case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
+ case 6: type = LLM_TYPE_60M; break; // t5-small
+ case 8: type = LLM_TYPE_80M; break; // flan-t5-small
case 12:
switch (hparams.n_ff()) {
- case 3072: model.type = e_model::MODEL_220M; break; // t5-base
- case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 3072: type = LLM_TYPE_220M; break; // t5-base
+ case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
+ default: type = LLM_TYPE_UNKNOWN;
} break;
case 24:
switch (hparams.n_ff()) {
- case 4096: model.type = e_model::MODEL_770M; break; // t5-large
- case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
- case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
- case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
- case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
- case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 4096: type = LLM_TYPE_770M; break; // t5-large
+ case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
+ case 16384: type = LLM_TYPE_3B; break; // t5-3b
+ case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
+ case 65536: type = LLM_TYPE_11B; break; // t5-11b
+ case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
+ default: type = LLM_TYPE_UNKNOWN;
} break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_T5ENCODER:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
- model.type = e_model::MODEL_UNKNOWN;
+ type = LLM_TYPE_UNKNOWN;
} break;
case LLM_ARCH_JAIS:
{
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1_3B; break;
- case 40: model.type = e_model::MODEL_13B; break;
+ case 24: type = LLM_TYPE_1_3B; break;
+ case 40: type = LLM_TYPE_13B; break;
/* TODO: add variants */
- default: model.type = e_model::MODEL_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_NEMOTRON:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_4B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_4B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_EXAONE:
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_8B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_8B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
{
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
- ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
- ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
- ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+ ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
+ ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+ ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+ ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
+ ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
switch (hparams.n_layer) {
- case 24: model.type = e_model::MODEL_1_6B; break;
+ case 24: type = LLM_TYPE_1_6B; break;
case 32:
switch (hparams.n_embd) {
- case 2560: model.type = e_model::MODEL_3B; break;
- case 4096: model.type = e_model::MODEL_7B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_7B; break;
+ default: type = LLM_TYPE_UNKNOWN;
} break;
- case 61: model.type = e_model::MODEL_14B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 61: type = LLM_TYPE_14B; break;
+ case 64: type = LLM_TYPE_32B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_3B; break;
- case 40: model.type = e_model::MODEL_3B; break;
+ case 32: type = LLM_TYPE_3B; break;
+ case 40: type = LLM_TYPE_3B; break;
// Add additional layer/vocab/etc checks here for other model sizes
- default: model.type = e_model::MODEL_UNKNOWN;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_CHAMELEON:
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
switch (hparams.n_layer) {
- case 32: model.type = e_model::MODEL_7B; break;
- case 48: model.type = e_model::MODEL_34B; break;
- default: model.type = e_model::MODEL_UNKNOWN;
+ case 32: type = LLM_TYPE_7B; break;
+ case 48: type = LLM_TYPE_34B; break;
+ default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_WAVTOKENIZER_DEC:
default: throw std::runtime_error("unsupported model architecture");
}
- model.ftype = ml.ftype;
+ pimpl->n_bytes = ml.n_bytes;
+
+ pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
if (hparams.f_max_alibi_bias > 0.0f) {
hparams.use_alibi = true;
}
- hparams.rope_type = llama_rope_type(&model);
+ hparams.rope_type = llama_model_rope_type(this);
}
-void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
- auto & vocab = model.vocab;
+void llama_model::load_vocab(llama_model_loader & ml) {
+ const auto kv = LLM_KV(arch);
- struct gguf_context * ctx = ml.meta.get();
-
- const auto kv = LLM_KV(model.arch);
-
- // determine vocab type
- {
- std::string tokenizer_model;
- std::string tokenizer_pre;
-
- ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
- ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
-
- if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
- vocab.type = LLAMA_VOCAB_TYPE_NONE;
-
- // default special tokens
- vocab.special_bos_id = LLAMA_TOKEN_NULL;
- vocab.special_eos_id = LLAMA_TOKEN_NULL;
- vocab.special_unk_id = LLAMA_TOKEN_NULL;
- vocab.special_sep_id = LLAMA_TOKEN_NULL;
- vocab.special_pad_id = LLAMA_TOKEN_NULL;
- vocab.special_cls_id = LLAMA_TOKEN_NULL;
- vocab.special_mask_id = LLAMA_TOKEN_NULL;
- vocab.linefeed_id = LLAMA_TOKEN_NULL;
-
- // read vocab size from metadata
- if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
- vocab.n_vocab = 0;
- LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
- }
- return;
- }
-
- if (tokenizer_model == "llama") {
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
-
- // default special tokens
- vocab.special_bos_id = 1;
- vocab.special_eos_id = 2;
- vocab.special_unk_id = 0;
- vocab.special_sep_id = LLAMA_TOKEN_NULL;
- vocab.special_pad_id = LLAMA_TOKEN_NULL;
- vocab.special_cls_id = LLAMA_TOKEN_NULL;
- vocab.special_mask_id = LLAMA_TOKEN_NULL;
- } else if (tokenizer_model == "bert") {
- vocab.type = LLAMA_VOCAB_TYPE_WPM;
-
- // default special tokens
- vocab.special_bos_id = LLAMA_TOKEN_NULL;
- vocab.special_eos_id = LLAMA_TOKEN_NULL;
- vocab.special_unk_id = 100;
- vocab.special_sep_id = 102;
- vocab.special_pad_id = 0;
- vocab.special_cls_id = 101;
- vocab.special_mask_id = 103;
- } else if (tokenizer_model == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
-
- // read bpe merges and populate bpe ranks
- const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
- if (merges_keyidx == -1) {
- throw std::runtime_error("cannot find tokenizer merges in model file\n");
- }
-
- const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
- for (int i = 0; i < n_merges; i++) {
- const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
- GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
-
- std::string first;
- std::string second;
-
- const size_t pos = word.find(' ', 1);
-
- if (pos != std::string::npos) {
- first = word.substr(0, pos);
- second = word.substr(pos + 1);
- }
+ vocab.load(ml, kv);
+}
- vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
- }
+bool llama_model::load_tensors(llama_model_loader & ml) {
+ const auto & split_mode = params.split_mode;
+ const auto & n_gpu_layers = params.n_gpu_layers;
+ const auto & use_mlock = params.use_mlock;
+ const auto & tensor_split = params.tensor_split;
- // default special tokens
- vocab.special_bos_id = 11;
- vocab.special_eos_id = 11;
- vocab.special_unk_id = LLAMA_TOKEN_NULL;
- vocab.special_sep_id = LLAMA_TOKEN_NULL;
- vocab.special_pad_id = LLAMA_TOKEN_NULL;
- vocab.special_cls_id = LLAMA_TOKEN_NULL;
- vocab.special_mask_id = LLAMA_TOKEN_NULL;
- } else if (tokenizer_model == "t5") {
- vocab.type = LLAMA_VOCAB_TYPE_UGM;
-
- // default special tokens
- vocab.special_bos_id = LLAMA_TOKEN_NULL;
- vocab.special_eos_id = 1;
- vocab.special_unk_id = 2;
- vocab.special_sep_id = LLAMA_TOKEN_NULL;
- vocab.special_pad_id = 0;
- vocab.special_cls_id = LLAMA_TOKEN_NULL;
- vocab.special_mask_id = LLAMA_TOKEN_NULL;
-
- const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
- if (precompiled_charsmap_keyidx != -1) {
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
- const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
- vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
-#ifdef IS_BIG_ENDIAN
- // correct endiannes of data in precompiled_charsmap binary blob
- uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
- *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
- assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
- size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
- uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
- for (size_t i = 0; i < xcda_array_size; ++i) {
- xcda_array[i] = __builtin_bswap32(xcda_array[i]);
- }
-#endif
- }
- } else if (tokenizer_model == "rwkv") {
- vocab.type = LLAMA_VOCAB_TYPE_RWKV;
-
- // default special tokens
- vocab.special_bos_id = LLAMA_TOKEN_NULL;
- vocab.special_eos_id = LLAMA_TOKEN_NULL;
- vocab.special_unk_id = LLAMA_TOKEN_NULL;
- vocab.special_sep_id = LLAMA_TOKEN_NULL;
- vocab.special_pad_id = LLAMA_TOKEN_NULL;
- } else {
- throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
- }
+ const int n_layer = hparams.n_layer;
- // for now, only BPE models have pre-tokenizers
- if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
- vocab.tokenizer_add_space_prefix = false;
- vocab.tokenizer_clean_spaces = true;
- if (tokenizer_pre.empty()) {
- LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
- LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
- LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
- LLAMA_LOG_WARN("%s: \n", __func__);
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (tokenizer_pre == "default") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- } else if (
- tokenizer_pre == "llama3" ||
- tokenizer_pre == "llama-v3" ||
- tokenizer_pre == "llama-bpe"||
- tokenizer_pre == "falcon3") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
- vocab.tokenizer_ignore_merges = true;
- vocab.tokenizer_add_bos = true;
- } else if (
- tokenizer_pre == "deepseek-llm") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "deepseek-coder") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "deepseek-v3") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "falcon") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
- } else if (
- tokenizer_pre == "mpt") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
- } else if (
- tokenizer_pre == "starcoder") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
- } else if (
- tokenizer_pre == "gpt-2" ||
- tokenizer_pre == "phi-2" ||
- tokenizer_pre == "jina-es" ||
- tokenizer_pre == "jina-de" ||
- tokenizer_pre == "gigachat" ||
- tokenizer_pre == "jina-v1-en" ||
- tokenizer_pre == "jina-v2-es" ||
- tokenizer_pre == "jina-v2-de" ||
- tokenizer_pre == "jina-v2-code" ||
- tokenizer_pre == "roberta-bpe") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
- } else if (
- tokenizer_pre == "refact") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
- } else if (
- tokenizer_pre == "command-r") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "qwen2") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "stablelm2") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
- } else if (
- tokenizer_pre == "olmo") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
- } else if (
- tokenizer_pre == "dbrx") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
- } else if (
- tokenizer_pre == "smaug-bpe") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
- } else if (
- tokenizer_pre == "poro-chat") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "chatglm-bpe") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
- vocab.special_bos_id = LLAMA_TOKEN_NULL;
- } else if (
- tokenizer_pre == "viking") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "jais") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
- } else if (
- tokenizer_pre == "tekken") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
- vocab.tokenizer_clean_spaces = false;
- vocab.tokenizer_ignore_merges = true;
- vocab.tokenizer_add_bos = true;
- } else if (
- tokenizer_pre == "smollm") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "codeshell") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
- } else if (
- tokenizer_pre == "bloom") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
- } else if (
- tokenizer_pre == "gpt3-finnish") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
- } else if (
- tokenizer_pre == "exaone") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
- } else if (
- tokenizer_pre == "chameleon") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
- vocab.tokenizer_add_bos = true;
- vocab.tokenizer_clean_spaces = false;
- } else if (
- tokenizer_pre == "minerva-7b") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
- } else if (
- tokenizer_pre == "megrez") {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
- } else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
- }
- } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- vocab.tokenizer_add_space_prefix = true;
- vocab.tokenizer_clean_spaces = false;
- vocab.tokenizer_add_bos = true;
- vocab.tokenizer_add_eos = false;
- } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- vocab.tokenizer_add_space_prefix = false;
- vocab.tokenizer_clean_spaces = true;
- vocab.tokenizer_add_bos = true;
- vocab.tokenizer_add_eos = false;
- } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- vocab.tokenizer_add_bos = false;
- vocab.tokenizer_add_eos = true;
- } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- vocab.tokenizer_add_space_prefix = false;
- vocab.tokenizer_clean_spaces = false;
- vocab.tokenizer_add_bos = false;
- vocab.tokenizer_add_eos = false;
- } else {
- vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- }
+ const bool use_mmap_buffer = true;
- ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
- ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
+ // build a list of buffer types for the CPU and GPU devices
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+ for (auto * dev : devices) {
+ buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
+ // add CPU buffer types as a fallback
+ buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
+ pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
}
- const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
- if (token_idx == -1) {
- throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+ // calculate the split points
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
+ std::vector<float> splits(n_devices());
+ if (all_zero) {
+ // default split, by free memory
+ for (size_t i = 0; i < n_devices(); ++i) {
+ ggml_backend_dev_t dev = devices[i];
+ size_t total;
+ size_t free;
+ ggml_backend_dev_memory(dev, &free, &total);
+ splits[i] = free;
+ }
+ } else {
+ std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
}
- const float * scores = nullptr;
- const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
- if (score_idx != -1) {
- scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+ // sum and normalize the splits to get the split points
+ float split_sum = 0.0f;
+ for (size_t i = 0; i < n_devices(); ++i) {
+ split_sum += splits[i];
+ splits[i] = split_sum;
}
-
- const int * toktypes = nullptr;
- const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
- if (toktype_idx != -1) {
- toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+ for (size_t i = 0; i < n_devices(); ++i) {
+ splits[i] /= split_sum;
}
- const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
+ const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
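+ // e.g. with n_layer = 32 and n_gpu_layers = 35 this gives i_gpu_start = 0 and
+ // act_gpu_layers = 33: every repeating layer plus the output layer is offloaded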
+ auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
+ if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
+ return {cpu_dev, &pimpl->cpu_buft_list};
+ }
+ const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
+ auto * dev = devices.at(layer_gpu);
+ return {dev, &pimpl->gpu_buft_list.at(dev)};
+ };
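+ // layers before i_gpu_start stay on the CPU; otherwise the fraction
+ // (il - i_gpu_start)/act_gpu_layers is looked up in the cumulative splits via
+ // upper_bound, so splits of {0.75, 1.0} send the first three quarters of the
+ // offloaded layers to device 0 and the rest to device 1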
- vocab.n_vocab = n_vocab;
- vocab.id_to_token.resize(n_vocab);
+ // assign the input layer
+ // there is very little benefit to offloading the input layer, so always keep it on the CPU
+ pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
- for (uint32_t i = 0; i < n_vocab; i++) {
- std::string word = gguf_get_arr_str(ctx, token_idx, i);
- if (word.empty()) {
- LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
- word = "[EMPTY_" + std::to_string(i) + "]";
- }
+ // assign the repeating layers to the devices according to the splits
+ pimpl->dev_layer.resize(n_layer);
+ for (int il = 0; il < n_layer; ++il) {
+ pimpl->dev_layer[il] = get_layer_buft_list(il);
+ }
- vocab.token_to_id[word] = i;
- vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
-
- auto & token_data = vocab.id_to_token[i];
- token_data.text = std::move(word);
- token_data.score = scores ? scores[i] : 0.0f;
- token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
-
- if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
- switch(toktypes[i]) {
- case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
- case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
- case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
- case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
- case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
- case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
- case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
- default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ // assign the output layer
+ pimpl->dev_output = get_layer_buft_list(n_layer);
+
+ // one ggml context per buffer type
+ int max_n_tensors = ml.n_tensors;
+ max_n_tensors += 1; // duplicated output tensor
+ max_n_tensors += n_layer*2; // duplicated rope freq tensors
+ const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
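+ // the contexts are created with no_alloc, so ctx_size only needs to cover the
+ // tensor metadata (ggml_tensor_overhead() bytes per tensor); the actual weight
+ // data is placed in backend buffers afterwards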
+
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create ggml context"));
}
- }
- }
- GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
- vocab.init_tokenizer();
+ ctx_map[buft] = ctx;
+ pimpl->ctxs.emplace_back(ctx);
- // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
- if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- try {
- vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
- } catch (const std::exception & e) {
- LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
- vocab.linefeed_id = vocab.special_pad_id;
+ return ctx;
}
- } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
- vocab.linefeed_id = vocab.special_pad_id;
- } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
- GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
- vocab.linefeed_id = ids[0];
- } else {
- const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+ return it->second;
+ };
- //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
- if (ids.empty()) {
- LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
- vocab.linefeed_id = vocab.special_pad_id;
- } else {
- vocab.linefeed_id = ids[0];
- }
- }
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
- // special tokens
+ // create tensors for the weights
{
- const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
- { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
- { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
- { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
- { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
- { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
- { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
- { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
-
- // deprecated
- { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
- { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
- { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
- };
+ // note: cast to int64_t since we will use these for the tensor dimensions
+ const int64_t n_head = hparams.n_head();
+ const int64_t n_head_kv = hparams.n_head_kv();
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;
+ const int64_t n_ff = hparams.n_ff();
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ const int64_t n_vocab = vocab.n_tokens();
+ const int64_t n_token_types = vocab.n_token_types();
+ const int64_t n_rot = hparams.n_rot;
+ const int64_t n_expert = hparams.n_expert;
+ const int64_t n_expert_used = hparams.n_expert_used;
+ const int64_t n_ctx_train = hparams.n_ctx_train;
+
+ if (n_expert > 0 && hparams.n_expert_used == 0) {
+ throw std::runtime_error("model has expert layers but no expert layers are used");
+ }
- for (const auto & it : special_token_types) {
- const std::string & key = kv(std::get<0>(it));
- int32_t & id = std::get<1>(it);
+ int n_moved_tensors = 0;
+ ggml_tensor * first_moved_tensor = nullptr;
+ ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+ ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
- uint32_t new_id;
- if (!ml.get_key(std::get<0>(it), new_id, false)) {
- continue;
- }
- if (new_id >= vocab.id_to_token.size()) {
- LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
- __func__, key.c_str(), new_id, id);
- } else {
- id = new_id;
+ auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
+ ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
+
+ if (!t_meta) {
+ if (flags & TENSOR_NOT_REQUIRED) {
+ return nullptr;
+ }
+ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
}
- }
- // Handle add_bos_token and add_eos_token
- {
- bool temp = true;
+ // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
+ // the tensor is duplicated
+ // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
+ llm_tensor tn_tensor = tn.tensor;
+ if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
+ tn_tensor = LLM_TENSOR_OUTPUT;
+ }
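+ // e.g. models with tied embeddings create the output tensor by requesting
+ // LLM_TENSOR_TOKEN_EMBD with TENSOR_DUPLICATED (see the arch cases below), so
+ // the buffer type is selected with the output-layer rules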
- if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
- vocab.tokenizer_add_bos = temp;
+ llm_tensor_info info;
+ try {
+ info = llm_tensor_info_for(tn_tensor);
+ } catch (const std::out_of_range & e) {
+ throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
}
- if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
- vocab.tokenizer_add_eos = temp;
+
+ // tensors with "bias" suffix are always used with GGML_OP_ADD
+ ggml_op op;
+ bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
+ if (bias) {
+ op = GGML_OP_ADD;
+ } else {
+ op = info.op;
}
- }
- // auto-detect special tokens by text
- // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
- // for now, we apply this workaround to find the tokens based on their text
-
- for (const auto & t : vocab.token_to_id) {
- // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
- if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|eot_id|>"
- || t.first == "<|im_end|>"
- || t.first == "<|end|>"
- || t.first == "<end_of_turn>"
- || t.first == "<|endoftext|>"
- || t.first == "<EOT>"
- || t.first == "<|end▁of▁sentence|>" // DeepSeek
- ) {
- vocab.special_eot_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
+ // sanity checks
+ if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+ if (tn.bid != -1) {
+ GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+ }
+ } else {
+ if (tn.bid == -1) {
+ GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
}
}
- // find EOM token: "<|eom_id|>"
- if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|eom_id|>"
- ) {
- vocab.special_eom_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
- }
+ // select the buffer type for this tensor
+ buft_list_t * buft_list;
+ switch (info.layer) {
+ case LLM_TENSOR_LAYER_INPUT:
+ buft_list = pimpl->dev_input.buft_list;
+ break;
+ case LLM_TENSOR_LAYER_OUTPUT:
+ buft_list = pimpl->dev_output.buft_list;
+ break;
+ case LLM_TENSOR_LAYER_REPEATING:
+ buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
+ break;
+ default:
+ GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
}
- // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
- if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|fim_prefix|>" // Qwen
- || t.first == "<fim-prefix>"
- || t.first == "<|fim▁begin|>" // DeepSeek
- || t.first == "<PRE>"
- ) {
- vocab.special_fim_pre_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
- }
+ ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
}
- // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
- if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|fim_suffix|>" // Qwen
- || t.first == "<fim-suffix>"
- || t.first == "<|fim▁hole|>" // DeepSeek
- || t.first == "<SUF>"
- ) {
- vocab.special_fim_suf_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
- }
+ // avoid using a host buffer when using mmap
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
+ if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
}
- // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
- if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|fim_middle|>" // Qwen
- || t.first == "<fim-middle>"
- || t.first == "<|fim▁end|>" // DeepSeek
- || t.first == "<MID>"
- ) {
- vocab.special_fim_mid_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
+ if (buft != buft_list->front().second) {
+ n_moved_tensors++;
+ if (!first_moved_tensor) {
+ first_moved_tensor = t_meta;
+ first_moved_from_buft = buft_list->front().second;
+ first_moved_to_buft = buft;
}
}
- // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
- if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|fim_pad|>" // Qwen
- || t.first == "<fim-pad>"
- || t.first == "<PAD>"
- ) {
- vocab.special_fim_pad_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+ if (flags & TENSOR_DUPLICATED) {
+ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+ if (t) {
+ return t;
}
}
+ return ml.create_tensor(ctx, tn, ne, flags);
+ };
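+ // the arch-specific cases below call create_tensor with the expected shape and a
+ // flags value: 0 marks the tensor as required, TENSOR_NOT_REQUIRED yields nullptr
+ // when it is absent, and TENSOR_DUPLICATED reuses an already-created tensor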
- // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
- if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|fim_repo|>" // Qwen
- || t.first == "<|repo_name|>"
- || t.first == "<fim-repo>"
- || t.first == "<REPO>"
- ) {
- vocab.special_fim_rep_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ layers.resize(n_layer);
+
+ // TODO: move to a separate function
+ const auto tn = LLM_TN(arch);
+ switch (arch) {
+ case LLM_ARCH_LLAMA:
+ case LLM_ARCH_REFACT:
+ case LLM_ARCH_MINICPM:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
- }
- }
- // find FIM_SEP token: "<|file_sep|>"
- if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
- if (false
- || t.first == "<|file_sep|>" // Qwen
- ) {
- vocab.special_fim_sep_id = t.second;
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ if (n_expert == 0) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // optional MLP bias
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_DECI:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
- }
- }
- }
- // maintain a list of tokens that cause end-of-generation
- // this is currently determined based on the token text, which is obviously not ideal
- // ref: https://github.com/ggerganov/llama.cpp/issues/9606
- vocab.special_eog_ids.clear();
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
+ const int64_t n_ff = hparams.n_ff(i);
+ const int64_t n_head = hparams.n_head(i);
+ const int64_t n_head_kv = hparams.n_head_kv(i);
+
+ if (n_head_kv == 0 && n_head > 0) {
+ // linear attention for DeciLMCausalModel
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ }
+ else if (n_head_kv > 0) {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+ }
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ else {
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // optional MLP bias
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
- if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
- vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
- }
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
- if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
- vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
- }
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
- if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
- vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
- }
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
- for (const auto & t : vocab.token_to_id) {
- if (false
- || t.first == "<|eot_id|>"
- || t.first == "<|im_end|>"
- || t.first == "<|end|>"
- || t.first == "<end_of_turn>"
- || t.first == "<|endoftext|>"
- || t.first == "<|eom_id|>"
- || t.first == "<EOT>"
- ) {
- vocab.special_eog_ids.insert(t.second);
- if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.second, t.first.c_str());
- vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
- }
- } else {
- // token is control, but not marked as EOG -> print a debug log
- if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
- LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
- __func__, t.second, t.first.c_str());
- }
- }
- }
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
- // sanity checks
- if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
- vocab.special_eog_ids.insert(vocab.special_eos_id);
- LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
- }
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
- if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
- vocab.special_eog_ids.insert(vocab.special_eot_id);
- LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
- }
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
- vocab.special_eog_ids.insert(vocab.special_eom_id);
- LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
- }
- }
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- // build special tokens cache
- {
- for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
- if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
- vocab.cache_special_tokens.push_back(id);
- }
- }
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ } break;
+ case LLM_ARCH_GROK:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("Grok model cannot have zero experts");
+ }
- std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
- [&] (const llama_vocab::id a, const llama_vocab::id b) {
- return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
- }
- );
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
- }
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
- // build token to piece cache
- {
- size_t size_cache = 0;
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
- std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
- for (uint32_t id = 0; id < n_vocab; ++id) {
- cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- size_cache += cache_token_to_piece[id].size();
- }
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
- LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
- }
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- // Handle per token attributes
- //NOTE: Each model customizes per token attributes.
- //NOTE: Per token attributes are missing from the GGUF file.
- //TODO: Extract attributes from GGUF file.
- {
- auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
- for (auto substr : substrs) {
- if (str.find(substr) < std::string::npos) {
- return true;
- }
- }
- return false;
- };
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
- uint32_t current = vocab.id_to_token.at(id).attr;
- current = value ? (current | attr) : (current & ~attr);
- vocab.id_to_token[id].attr = (llama_token_attr) current;
- };
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_DBRX:
+ {
+ if (n_expert == 0) {
+ throw std::runtime_error("DBRX model cannot have zero experts");
+ }
- auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
- _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
- };
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- std::string model_name;
- std::string tokenizer_pre;
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
- ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
- // model name to lowercase
- std::transform(model_name.begin(), model_name.end(), model_name.begin(),
- [] (const std::string::value_type x) {
- return std::tolower(x);
- }
- );
-
- // set attributes by model/tokenizer name
- if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
- _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
- } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
- for (auto id : vocab.cache_special_tokens) {
- _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
- }
- for (auto token : {"</s>"}) {
- _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
- }
- for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
- _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
- }
- }
- }
-}
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
- const auto & hparams = model.hparams;
- const auto & vocab = model.vocab;
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
- auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
- bool is_var = false;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ }
- std::vector<uint32_t> v;
- for (uint32_t i = 0; i < n; ++i) {
- v.push_back(f(i));
- if (v[i] != v[0]) {
- is_var = true;
- }
- }
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
- std::stringstream ss;
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- if (is_var) {
- ss << "[";
- for (uint32_t i = 0; i < n; ++i) {
- ss << v[i];
- if (i < n - 1) {
- ss << ", ";
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_STARCODER:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
+ // output
+ {
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ // needs to be on GPU
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_BERT:
+ case LLM_ARCH_NOMIC_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
+
+ if (arch == LLM_ARCH_BERT) {
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
+ }
+
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ if (arch == LLM_ARCH_BERT) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ } else {
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+
+ if (arch == LLM_ARCH_BERT) {
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ } else {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ }
+
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_JINA_BERT_V2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
+ type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
+
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
+ cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i]; // JinaBertLayer
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
+
+ layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
+ layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
+ layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_BLOOM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_MPT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (!output) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ // AWQ ScaleActivation layer
+ layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_STABLELM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // optional bias tensors, present in StableLM 2 1.6B
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ // optional q and k layernorms, present in StableLM 2 12B
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+ // optional FFN norm, not present in StableLM 2 12B, which uses parallel residual
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN2:
+ case LLM_ARCH_QWEN2VL:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_QWEN2MOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
+ }
+
+ // MoE branch
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
+ layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
+ }
+ } break;
+ case LLM_ARCH_PHI2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+
+ if (layer.wqkv == nullptr) {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ }
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_PHI3:
+ {
+ const int64_t n_embd_head = n_embd / n_head;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
+
+ layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+ }
+ } break;
+ case LLM_ARCH_PLAMO:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_GPT2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_CODESHELL:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_ORION:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_INTERNLM2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_GEMMA:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_GEMMA2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_STARCODER2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // bias tensors
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_MAMBA:
+ {
+ const int64_t d_conv = hparams.ssm_d_conv;
+ const int64_t d_inner = hparams.ssm_d_inner;
+ const int64_t d_state = hparams.ssm_d_state;
+ const int64_t dt_rank = hparams.ssm_dt_rank;
+
+ // only an expansion factor of 2 is supported for now
+ if (2 * n_embd != d_inner) {
+ throw std::runtime_error("only an expansion factor of 2 is supported for now");
+ }
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed, duplicated to allow offloading
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ // norm
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
+
+ layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
+ layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
+
+ layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
+
+ layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
+ layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
+
+ // no "weight" suffix for these
+ layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
+ layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
+
+ // out_proj
+ layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_XVERSE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_COMMAND_R:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // init output from the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ if (n_layer >= 64) {
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+ }
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_COHERE2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+ // init output from the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
+ TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
+ }
+ } break;
+ case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_OLMO2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_OLMOE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_OPENELM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ // init output from the input tok embed
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+
+ for (int i = 0; i < n_layer; ++i) {
+ const int64_t n_head = hparams.n_head(i);
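+ // head counts vary per layer in OpenELM; the fused QKV projection packs n_head query heads plus n_head_kv key and n_head_kv value heads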
+ const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
+ const int64_t n_ff = hparams.n_ff(i);
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_GPTNEOX:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_ARCTIC:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK:
+ {
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_DEEPSEEK2:
+ {
+ const bool is_lite = (hparams.n_layer == 27);
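+ // only the lite variant uses a plain Q projection; non-lite models use the low-rank wq_a/wq_b pair below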
+
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ if (!is_lite) {
+ layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
+ }
+
+ layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+ if (!is_lite) {
+ layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+ layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+ } else {
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ }
+
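+ // the joint KV down-projection also carries the decoupled RoPE key part (n_embd_head_qk_rope), which is shared across heads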
+ layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ if (i < (int) hparams.n_layer_dense_lead) {
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ } else {
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+ if (n_expert == 0) {
+ throw std::runtime_error("n_expert must be > 0");
+ }
+ if (n_expert_used == 0) {
+ throw std::runtime_error("n_expert_used must be > 0");
+ }
+
+ // MoE branch
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+ // Shared expert branch
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+ }
+ }
+ } break;
+ case LLM_ARCH_BITNET:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_T5:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
+ // this tensor seems to be unused in HF transformers implementation
+ layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
+
+ layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_JAIS:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_CHATGLM:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+ layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
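+ // no separate ffn_gate tensor: the gate and up projections appear to be stored fused in this single tensor, hence the n_ff * 2 width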
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ }
+ } break;
+ case LLM_ARCH_NEMOTRON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ // optional bias tensors
+ layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+ layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+ // optional MLP bias
+ layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
+ case LLM_ARCH_EXAONE:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
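+ // the rope frequency factors exist once in the file; layers after the first are marked as duplicates of the same tensor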
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_RWKV6:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // Block 0, LN0
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+ const int head_size = hparams.wkv_head_size;
+ const int attn_hidden_size = n_embd;
+ const int ffn_size = hparams.n_ff_arr[0];
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+ layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+ layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
+
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+ layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
+ GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
+
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+ layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+ layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+ layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+ layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
+
+ layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+ layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+ layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
+ }
+
+ } break;
+ case LLM_ARCH_RWKV6QWEN2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+ const int head_size = hparams.wkv_head_size;
+ const int attn_hidden_size = n_embd;
+ const int n_head_kv = hparams.n_head_kv();
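+ // the key/value projections shrink to n_head_kv * head_size when grouped-query attention is used; otherwise they match the hidden size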
+ int attn_key_value_size;
+ if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
+ attn_key_value_size = attn_hidden_size;
+ } else {
+ attn_key_value_size = n_head_kv * head_size;
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+ // optional bias tensors
+ layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+ layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);
+
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_CHAMELEON:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+ layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ }
+ } break;
+ case LLM_ARCH_WAVTOKENIZER_DEC:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
+
+ conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
+ conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
+
+ // posnet
+ {
+ const int64_t n_embd = hparams.posnet.n_embd;
+
+ for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+ auto & layer = layers[i].posnet;
+
+ // posnet:
+ //
+ // - resnet
+ // - resnet
+ // - attn
+ // - resnet
+ // - resnet
+ // - norm
+ //
+ switch (i) {
+ case 0:
+ case 1:
+ case 3:
+ case 4:
+ {
+ layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+ layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
+
+ layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+ layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
+
+ layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+ layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
+
+ layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+ layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+ } break;
+ case 2:
+ {
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+ layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
+
+ layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
+ layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
+ } break;
+ case 5:
+ {
+ layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+ } break;
+ default: GGML_ABORT("unknown posnet layer");
+ }
+ }
+ }
+
+ GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+ tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
+
+ // convnext
+ {
+ const int64_t n_embd = hparams.convnext.n_embd;
+
+ for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+ auto & layer = layers[i].convnext;
+
+ layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
+ layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
+
+ layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
+ layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
+
+ layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+ layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
+
+ layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
+ layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
+
+ layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+ }
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+ }
+
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
+ output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
+ } break;
+ default:
+ throw std::runtime_error("unknown architecture");
+ }
+
+ if (n_moved_tensors > 0) {
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+ __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ }
+ }
+
+ ml.done_getting_tensors();
+
+ ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+ pimpl->mappings.reserve(ml.mappings.size());
+
+ // create the backend buffers
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
+ ctx_bufs.reserve(ctx_map.size());
+
+ // Ensure we have enough capacity for the maximum number of backend buffers we may potentially create
+ const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+ pimpl->bufs.reserve(n_max_backend_buffer);
+
+ for (auto & it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx = it.second;
+
+ // skip contexts without tensors
+ if (ggml_get_first_tensor(ctx) == nullptr) {
+ continue;
+ }
+
+ llama_buf_map buf_map;
+ buf_map.reserve(n_max_backend_buffer);
+
+ // check if it is possible to use buffer_from_host_ptr with this buffer type
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ // FIXME: workaround for CPU backend buft having a NULL device
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ }
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+ bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
+
+ if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
+ void * addr = nullptr;
+ size_t first, last; // NOLINT
+ ml.get_mapping_range(&first, &last, &addr, idx, ctx);
+ if (first >= last) {
+ continue;
+ }
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
+ ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+ if (buf == nullptr) {
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+ }
+ pimpl->bufs.emplace_back(buf);
+ buf_map.emplace(idx, buf);
+ }
+ } else {
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (buf == nullptr) {
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+ }
+ pimpl->bufs.emplace_back(buf);
+ if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+ pimpl->mlock_bufs.emplace_back(new llama_mlock);
+ auto & mlock_buf = pimpl->mlock_bufs.back();
+ mlock_buf->init (ggml_backend_buffer_get_base(buf));
+ mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+ }
+ for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+ buf_map.emplace(idx, buf);
+ }
+ }
+
+ if (pimpl->bufs.empty()) {
+ throw std::runtime_error("failed to allocate buffer");
+ }
+
+ for (auto & buf : buf_map) {
+ // indicate that this buffer contains weights
+ // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+ ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+ }
+
+ ctx_bufs.emplace_back(ctx, buf_map);
+ }
+
+ if (llama_supports_gpu_offload()) {
+ const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+ if (n_gpu_layers > (int) hparams.n_layer) {
+ LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+ }
+
+ const int max_backend_supported_layers = hparams.n_layer + 1;
+ const int max_offloadable_layers = hparams.n_layer + 1;
+
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+ }
+
+ // print memory requirements per buffer type
+ for (auto & buf : pimpl->bufs) {
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+ }
+
+ // populate tensors_by_name
+ for (auto & ctx : pimpl->ctxs) {
+ for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+ tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+ }
+ }
+
+ // load tensor data
+ for (auto & it : ctx_bufs) {
+ ggml_context * ctx = it.first;
+ auto & bufs = it.second;
+ if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+ return false;
+ }
+ }
+
+ if (use_mmap_buffer) {
+ for (auto & mapping : ml.mappings) {
+ pimpl->mappings.emplace_back(std::move(mapping));
+ }
+ }
+
+ return true;
+}
+
+std::string llama_model::arch_name() const {
+ return llm_arch_name(arch);
+}
+
+std::string llama_model::type_name() const {
+ return llm_type_name(type);
+}
+
+std::string llama_model::desc() const {
+ return pimpl->desc_str;
+}
+
+size_t llama_model::size() const {
+ return pimpl->n_bytes;
+}
+
+size_t llama_model::max_nodes() const {
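+ // heuristic graph size: roughly 5 nodes per weight tensor, never fewer than 8192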
+ return std::max<size_t>(8192, tensors_by_name.size()*5);
+}
+
+size_t llama_model::n_devices() const {
+ return devices.size();
+}
+
+uint64_t llama_model::n_elements() const {
+ return pimpl->n_elements;
+}
+
+void llama_model::print_info() const {
+ const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+
+ auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
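+ // collect the per-layer values of f and render them as a bracketed list when they vary across layers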
+ bool is_var = false;
+
+ std::vector<uint32_t> v;
+ for (uint32_t i = 0; i < n; ++i) {
+ v.push_back(f(i));
+ if (v[i] != v[0]) {
+ is_var = true;
+ }
+ }
+
+ std::stringstream ss;
+
+ if (is_var) {
+ ss << "[";
+ for (uint32_t i = 0; i < n; ++i) {
+ ss << v[i];
+ if (i < n - 1) {
+ ss << ", ";
}
}
ss << "]";
};
// hparams
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llm_arch_name(model.arch));
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
if (!hparams.vocab_only) {
LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
}
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model).c_str());
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model).c_str());
- if (ml.n_elements >= 1e12) {
- LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
- } else if (ml.n_elements >= 1e9) {
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
- } else if (ml.n_elements >= 1e6) {
- LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
+ if (pimpl->n_elements >= 1e12) {
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+ } else if (pimpl->n_elements >= 1e9) {
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+ } else if (pimpl->n_elements >= 1e6) {
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
} else {
- LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
- }
- if (ml.n_bytes < GiB) {
- LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
- } else {
- LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
}
// general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
-
- // special tokens
- if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
- if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
- if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
-
- if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
-
- if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
- if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
- if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
- if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
- if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
- if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
-
- for (const auto & id : vocab.special_eog_ids) {
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
- }
-
- LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
- if (model.arch == LLM_ARCH_DEEPSEEK) {
+ if (arch == LLM_ARCH_DEEPSEEK) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
}
- if (model.arch == LLM_ARCH_DEEPSEEK2) {
+ if (arch == LLM_ARCH_DEEPSEEK2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
}
- if (model.arch == LLM_ARCH_QWEN2MOE) {
+ if (arch == LLM_ARCH_QWEN2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
- if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+ if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
}
+
+ vocab.print_info();
+}
+
+ggml_backend_dev_t llama_model::dev_layer(int il) const {
+ return pimpl->dev_layer.at(il).dev;
+}
+
+ggml_backend_dev_t llama_model::dev_output() const {
+ return pimpl->dev_output.dev;
+}
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context_ptr ctx { ggml_init(params) };
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+
+ ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+ ggml_tensor * op_tensor = fn(ctx.get());
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op_tensor->src[i] != nullptr) {
+ assert(op_tensor->src[i]->buffer == nullptr);
+ op_tensor->src[i]->buffer = buf.get();
+ }
+ }
+
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+ return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
+ for (const auto & cur : buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (buft_supported(cur_buft, cur_dev, fn)) {
+ return cur_buft;
+ }
+ }
+
+ throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
+ return ::select_buft(
+ *pimpl->dev_layer.at(il).buft_list,
+ [&](ggml_context * ctx) {
+ ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+ ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+ return ggml_add(ctx, cur, layer_dir);
+ });
+}
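
The two helpers above implement a small capability probe: the op is built in a metadata-only (no_alloc) context, a zero-size buffer of the candidate buffer type is attached to its sources, and the device is asked whether it can execute the op. A minimal standalone sketch of the same pattern, assuming only the public ggml headers; the helper name and matmul shape are illustrative and not part of this change:

#include "ggml.h"
#include "ggml-backend.h"

// illustrative probe: can this device run an F32 matmul of size n x n?
static bool dev_supports_f32_matmul(ggml_backend_dev_t dev, int64_t n) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor metadata only, no data is allocated
    };
    ggml_context * ctx = ggml_init(params);
    ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
    ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, n);
    ggml_tensor * c = ggml_mul_mat(ctx, a, b);   // the op being probed
    // buft_supported() above additionally binds a zero-size buffer of the
    // candidate buffer type to the op's sources before asking the device
    const bool ok = ggml_backend_dev_supports_op(dev, c);
    ggml_free(ctx);
    return ok;
}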
+
+const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
+ auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
+ [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+ return it.first == name;
+ });
+ if (it == tensors_by_name.end()) {
+ return nullptr;
+ }
+
+ return it->second;
}
//
return result;
}
+const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model) {
+ return &model->vocab;
+}
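
A short usage sketch for the new accessor (illustrative, not part of the patch); it pairs with the llama_vocab_* getters used further down in this change:

// illustrative: reach the vocab through the model, then query its size
const struct llama_vocab * vocab = llama_model_get_vocab(model);
const int32_t n_vocab = llama_vocab_n_tokens(vocab);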
+
void llama_free_model(struct llama_model * model) {
llama_model_free(model);
}
delete model;
}
-enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
- return model->vocab.type;
+int32_t llama_model_n_ctx_train(const struct llama_model * model) {
+ return model->hparams.n_ctx_train;
+}
+
+int32_t llama_model_n_embd(const struct llama_model * model) {
+ return model->hparams.n_embd;
+}
+
+int32_t llama_model_n_layer(const struct llama_model * model) {
+ return model->hparams.n_layer;
}
-int32_t llama_n_vocab(const struct llama_model * model) {
- return model->hparams.n_vocab;
+int32_t llama_model_n_head(const struct llama_model * model) {
+ return model->hparams.n_head();
}
+// deprecated
int32_t llama_n_ctx_train(const struct llama_model * model) {
- return model->hparams.n_ctx_train;
+ return llama_model_n_ctx_train(model);
}
+// deprecated
int32_t llama_n_embd(const struct llama_model * model) {
- return model->hparams.n_embd;
+ return llama_model_n_embd(model);
}
+// deprecated
int32_t llama_n_layer(const struct llama_model * model) {
- return model->hparams.n_layer;
+ return llama_model_n_layer(model);
}
+// deprecated
int32_t llama_n_head(const struct llama_model * model) {
- return model->hparams.n_head();
+ return llama_model_n_head(model);
}
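
The old free functions now simply forward to the llama_model_-prefixed getters, so callers can migrate one call at a time; a before/after sketch (illustrative):

// before: deprecated wrapper, still compiles
const int32_t n_embd_old = llama_n_embd(model);
// after: the replacement introduced by this change
const int32_t n_embd_new = llama_model_n_embd(model);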
-enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
switch (model->arch) {
// these models do not use RoPE
case LLM_ARCH_GPT2:
case LLM_ARCH_T5ENCODER:
case LLM_ARCH_JAIS:
case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
case LLM_ARCH_WAVTOKENIZER_DEC:
return LLAMA_ROPE_TYPE_NONE;
// the pairs of head values are offset by n_rot/2
case LLM_ARCH_OLMOE:
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
+ case LLM_ARCH_PHIMOE:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_STARCODER2:
return LLAMA_ROPE_TYPE_NEOX;
}
-float llama_rope_freq_scale_train(const struct llama_model * model) {
+float llama_model_rope_freq_scale_train(const struct llama_model * model) {
return model->hparams.rope_freq_scale_train;
}
}
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "%s %s %s",
- llama_model_arch_name (*model).c_str(),
- llama_model_type_name (*model).c_str(),
- llama_model_ftype_name(*model).c_str());
+ return snprintf(buf, buf_size, "%s", model->desc().c_str());
}
uint64_t llama_model_size(const struct llama_model * model) {
- return model->n_bytes;
+ return model->size();
+}
+
+const char * llama_model_chat_template(const struct llama_model * model) {
+ const auto & it = model->gguf_kv.find(LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE));
+ if (it == model->gguf_kv.end()) {
+ return nullptr;
+ }
+
+ return it->second.c_str();
}
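
The getter returns a pointer into the model's GGUF metadata, or nullptr when no chat template is stored, so callers need their own fallback; a hedged usage sketch (the fallback string is an application choice, not something this patch defines):

// illustrative: prefer the model's embedded template, otherwise fall back
const char * tmpl = llama_model_chat_template(model);
if (tmpl == nullptr) {
    tmpl = "chatml";   // hypothetical application-level default
}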
uint64_t llama_model_n_params(const struct llama_model * model) {
- return model->n_elements;
+ return model->n_elements();
}
bool llama_model_is_recurrent(const struct llama_model * model) {
switch (model->arch) {
case LLM_ARCH_MAMBA: return true;
case LLM_ARCH_RWKV6: return true;
+ case LLM_ARCH_RWKV6QWEN2: return true;
default: return false;
}
}
#include "llama-arch.h"
#include "llama-hparams.h"
#include "llama-vocab.h"
-#include "llama-mmap.h"
-
-#include "ggml-cpp.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
#include <vector>
+struct llama_model_loader;
+
// available models
-// TODO: this enum does not follow the enum naming convention
enum llm_type {
- MODEL_UNKNOWN,
- MODEL_14M,
- MODEL_17M,
- MODEL_22M,
- MODEL_33M,
- MODEL_60M,
- MODEL_70M,
- MODEL_80M,
- MODEL_109M,
- MODEL_137M,
- MODEL_160M,
- MODEL_220M,
- MODEL_250M,
- MODEL_270M,
- MODEL_335M,
- MODEL_410M,
- MODEL_450M,
- MODEL_770M,
- MODEL_780M,
- MODEL_0_5B,
- MODEL_1B,
- MODEL_1_3B,
- MODEL_1_4B,
- MODEL_1_5B,
- MODEL_1_6B,
- MODEL_2B,
- MODEL_2_8B,
- MODEL_3B,
- MODEL_4B,
- MODEL_6B,
- MODEL_6_9B,
- MODEL_7B,
- MODEL_8B,
- MODEL_9B,
- MODEL_11B,
- MODEL_12B,
- MODEL_13B,
- MODEL_14B,
- MODEL_15B,
- MODEL_16B,
- MODEL_20B,
- MODEL_30B,
- MODEL_32B,
- MODEL_34B,
- MODEL_35B,
- MODEL_40B,
- MODEL_65B,
- MODEL_70B,
- MODEL_236B,
- MODEL_314B,
- MODEL_671B,
- MODEL_SMALL,
- MODEL_MEDIUM,
- MODEL_LARGE,
- MODEL_XL,
- MODEL_A1_7B,
- MODEL_A2_7B,
- MODEL_8x7B,
- MODEL_8x22B,
- MODEL_16x12B,
- MODEL_10B_128x3_66B,
- MODEL_57B_A14B,
- MODEL_27B,
+ LLM_TYPE_UNKNOWN,
+ LLM_TYPE_14M,
+ LLM_TYPE_17M,
+ LLM_TYPE_22M,
+ LLM_TYPE_33M,
+ LLM_TYPE_60M,
+ LLM_TYPE_70M,
+ LLM_TYPE_80M,
+ LLM_TYPE_109M,
+ LLM_TYPE_137M,
+ LLM_TYPE_160M,
+ LLM_TYPE_220M,
+ LLM_TYPE_250M,
+ LLM_TYPE_270M,
+ LLM_TYPE_335M,
+ LLM_TYPE_410M,
+ LLM_TYPE_450M,
+ LLM_TYPE_770M,
+ LLM_TYPE_780M,
+ LLM_TYPE_0_5B,
+ LLM_TYPE_1B,
+ LLM_TYPE_1_3B,
+ LLM_TYPE_1_4B,
+ LLM_TYPE_1_5B,
+ LLM_TYPE_1_6B,
+ LLM_TYPE_2B,
+ LLM_TYPE_2_8B,
+ LLM_TYPE_3B,
+ LLM_TYPE_4B,
+ LLM_TYPE_6B,
+ LLM_TYPE_6_9B,
+ LLM_TYPE_7B,
+ LLM_TYPE_8B,
+ LLM_TYPE_9B,
+ LLM_TYPE_11B,
+ LLM_TYPE_12B,
+ LLM_TYPE_13B,
+ LLM_TYPE_14B,
+ LLM_TYPE_15B,
+ LLM_TYPE_16B,
+ LLM_TYPE_20B,
+ LLM_TYPE_30B,
+ LLM_TYPE_32B,
+ LLM_TYPE_34B,
+ LLM_TYPE_35B,
+ LLM_TYPE_40B,
+ LLM_TYPE_65B,
+ LLM_TYPE_70B,
+ LLM_TYPE_236B,
+ LLM_TYPE_314B,
+ LLM_TYPE_671B,
+ LLM_TYPE_SMALL,
+ LLM_TYPE_MEDIUM,
+ LLM_TYPE_LARGE,
+ LLM_TYPE_XL,
+ LLM_TYPE_A1_7B,
+ LLM_TYPE_A2_7B,
+ LLM_TYPE_8x7B,
+ LLM_TYPE_8x22B,
+ LLM_TYPE_16x12B,
+ LLM_TYPE_16x3_8B,
+ LLM_TYPE_10B_128x3_66B,
+ LLM_TYPE_57B_A14B,
+ LLM_TYPE_27B,
};
struct llama_layer_posnet {
struct ggml_tensor * time_mix_lerp_v = nullptr;
struct ggml_tensor * time_mix_lerp_r = nullptr;
struct ggml_tensor * time_mix_lerp_g = nullptr;
-
- struct ggml_tensor * time_mix_first = nullptr;
- struct ggml_tensor * time_mix_decay = nullptr;
- struct ggml_tensor * time_mix_decay_w1 = nullptr;
- struct ggml_tensor * time_mix_decay_w2 = nullptr;
- struct ggml_tensor * time_mix_key = nullptr;
- struct ggml_tensor * time_mix_value = nullptr;
- struct ggml_tensor * time_mix_receptance = nullptr;
- struct ggml_tensor * time_mix_gate = nullptr;
+ struct ggml_tensor * time_mix_lerp_fused = nullptr;
+
+ struct ggml_tensor * time_mix_first = nullptr;
+ struct ggml_tensor * time_mix_decay = nullptr;
+ struct ggml_tensor * time_mix_decay_w1 = nullptr;
+ struct ggml_tensor * time_mix_decay_w2 = nullptr;
+ struct ggml_tensor * time_mix_key = nullptr;
+ struct ggml_tensor * time_mix_key_b = nullptr;
+ struct ggml_tensor * time_mix_value = nullptr;
+ struct ggml_tensor * time_mix_value_b = nullptr;
+ struct ggml_tensor * time_mix_receptance = nullptr;
+ struct ggml_tensor * time_mix_receptance_b = nullptr;
+ struct ggml_tensor * time_mix_gate = nullptr;
struct ggml_tensor * time_mix_ln = nullptr;
struct ggml_tensor * time_mix_ln_b = nullptr;
};
struct llama_model {
- llm_type type = MODEL_UNKNOWN;
+ llm_type type = LLM_TYPE_UNKNOWN;
llm_arch arch = LLM_ARCH_UNKNOWN;
- llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
-
std::string name = "n/a";
llama_hparams hparams = {};
std::vector<llama_layer> layers;
+ llama_model_params params;
+
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
- llama_split_mode split_mode;
- int main_gpu;
- int n_gpu_layers;
-
std::vector<std::string> rpc_servers;
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
-
- // lists of buffer types used for each layer
- using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
- buft_list_t cpu_buft_list;
- std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
-
- struct layer_dev {
- ggml_backend_dev_t dev;
- buft_list_t * buft_list;
- };
-
- layer_dev dev_input = {};
- layer_dev dev_output = {};
- std::vector<layer_dev> dev_layer;
-
- // contexts where the model tensors metadata is stored
- std::vector<ggml_context_ptr> ctxs;
-
- // the model memory buffers for the tensor data
- std::vector<ggml_backend_buffer_ptr> bufs;
-
- // model memory mapped files
- llama_mmaps mappings;
-
- // objects representing data potentially being locked in memory
- llama_mlocks mlock_bufs;
- llama_mlocks mlock_mmaps;
-
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
int64_t t_load_us = 0;
int64_t t_start_us = 0;
- // total number of parameters in the model
- uint64_t n_elements = 0;
+ explicit llama_model(const struct llama_model_params & params);
+ ~llama_model();
- // total size of all the tensors in the model in bytes
- size_t n_bytes = 0;
-};
+ void load_stats (llama_model_loader & ml);
+ void load_arch (llama_model_loader & ml);
+ void load_hparams(llama_model_loader & ml);
+ void load_vocab (llama_model_loader & ml);
+ bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback
-const char * llm_type_name(llm_type type);
+ std::string arch_name() const;
+ std::string type_name() const;
+
+ std::string desc() const;
-std::string llama_model_arch_name (const llama_model & model);
-std::string llama_model_type_name (const llama_model & model);
-std::string llama_model_ftype_name(const llama_model & model);
+ size_t size() const;
+ size_t max_nodes() const;
+ size_t n_devices() const;
-// used by llama_adapter_cvec
-ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il);
+ // total number of parameters in the model
+ uint64_t n_elements() const;
-// used by llama_adapter_lora
-struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name);
+ void print_info() const;
-size_t llama_model_max_nodes(const llama_model & model);
+ ggml_backend_dev_t dev_layer(int il) const;
+ ggml_backend_dev_t dev_output() const;
-struct llama_model_loader;
+ ggml_backend_buffer_type_t select_buft(int il) const;
+
+ const struct ggml_tensor * get_tensor(const char * name) const;
-// TODO: become llama_model methods
-void llm_load_stats (llama_model_loader & ml, llama_model & model);
-void llm_load_arch (llama_model_loader & ml, llama_model & model);
-void llm_load_hparams (llama_model_loader & ml, llama_model & model);
-void llm_load_vocab (llama_model_loader & ml, llama_model & model);
-void llm_load_print_meta(llama_model_loader & ml, llama_model & model);
+private:
+ struct impl;
+ std::unique_ptr<impl> pimpl;
+};
+
+const char * llm_type_name(llm_type type);
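
Because llama_model now keeps its backend bookkeeping behind std::unique_ptr<impl>, the class needs out-of-line special members (hence the explicit constructor and destructor declared above): unique_ptr cannot destroy an incomplete type. A generic sketch of the idiom, using a hypothetical widget type rather than llama.cpp code:

#include <memory>

struct widget {
    widget();
    ~widget();                  // must be defined in the .cpp, where impl is complete

private:
    struct impl;                // forward declaration only
    std::unique_ptr<impl> pimpl;
};

// in the .cpp, after `struct widget::impl { /* private state */ };`
// widget::widget() : pimpl(std::make_unique<impl>()) {}
// widget::~widget() = default;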
#include <algorithm>
#include <cmath>
#include <cstring>
+#include <cinttypes>
#include <fstream>
#include <mutex>
#include <thread>
#include <unordered_map>
-// TODO: replace with ggml API call
-#define QK_K 256
-
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
} else {
- int nx = tensor->ne[0];
- if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+ const int64_t nx = tensor->ne[0];
+ const int64_t qk_k = ggml_blck_size(new_type);
+
+ if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
- if (qs.model.type == MODEL_70B) {
+ if (qs.model.type == LLM_TYPE_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
- new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
- new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
- new_type == GGML_TYPE_IQ1_M) {
- int nx = tensor->ne[0];
- int ny = tensor->ne[1];
- if (nx % QK_K != 0) {
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+ {
+ const int64_t nx = tensor->ne[0];
+ const int64_t ny = tensor->ne[1];
+ const int64_t qk_k = ggml_blck_size(new_type);
+
+ if (nx % qk_k != 0) {
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
+
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
kv_overrides = v->data();
}
+
llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
ml.init_mappings(false); // no prefetching
- llama_model model;
- llm_load_arch (ml, model);
- llm_load_hparams(ml, model);
- llm_load_stats (ml, model);
+ llama_model model(llama_model_default_params());
+
+ model.load_arch (ml);
+ model.load_hparams(ml);
+ model.load_stats (ml);
struct quantize_state_impl qs(model, params);
if (params->only_copy) {
- ftype = model.ftype;
+ ftype = ml.ftype;
}
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
if (params->imatrix) {
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
- // sanity checks
+ // sanity checks for models that have attention layers
+ if (qs.n_attention_wv != 0)
{
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+ quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;
// update the gguf meta data as we go
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
- gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
// write tensor data + padding
fout.write((const char *) new_data, new_size);
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_vocab = llama_vocab_n_tokens(vocab);
// TODO: do not allocate each time
std::vector<llama_token_data> cur;
static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
- auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr);
+ auto * result = llama_sampler_init_grammar(ctx->vocab, nullptr, nullptr);
// copy the state
{
/* .free = */ llama_sampler_grammar_free,
};
-struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) {
+struct llama_sampler * llama_sampler_init_grammar(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) {
auto * ctx = new llama_sampler_grammar;
if (grammar_str != nullptr && grammar_str[0] != '\0') {
*ctx = {
- /* .vocab = */ &vocab,
+ /* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
- /* .grammar = */ llama_grammar_init_impl(&vocab, grammar_str, grammar_root),
+ /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root),
};
} else {
*ctx = {
- /* .vocab = */ &vocab,
+ /* .vocab = */ vocab,
/* .grammar_str = */ {},
/* .grammar_root = */ {},
/* .grammar = */ nullptr,
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
- for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
- std::string word = llama_detokenize(vocab, {token_id}, true);
+ for (llama_token token_id = 0; token_id < (llama_token) vocab.n_tokens(); token_id++) {
+ std::string word = vocab.detokenize({token_id}, true);
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
}
}
if (match) {
- std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
+ std::vector<llama_token> tokenization = vocab.tokenize(str.substr(i), false, false);
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
tokenization.resize(max_tail_len);
}
llama_vocab dummy_vocab;
// dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
- auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+ auto * result = llama_sampler_init_dry(&dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
// Copy the state, including the processed breakers
{
/* .free = */ llama_sampler_dry_free,
};
-struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
const int MAX_CHAR_LEN = 40;
sequence_break.resize(MAX_CHAR_LEN);
}
- get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
+ get_overlapping_token_sequences(*vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
}
}
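
With the _impl suffix dropped, the DRY sampler is constructed directly from a public vocab pointer; an illustrative call (the numeric values are arbitrary examples; only the -1 meaning of dry_penalty_last_n is taken from the code above):

// illustrative DRY sampler construction
const char * breakers[] = { "\n", ":" };
struct llama_sampler * dry = llama_sampler_init_dry(
    vocab,      // const llama_vocab *, e.g. obtained via llama_model_get_vocab()
    4096,       // context_size
    0.8f,       // dry_multiplier
    1.75f,      // dry_base
    2,          // dry_allowed_length
    -1,         // dry_penalty_last_n: -1 means "use context_size"
    breakers, 2);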
// wrapper for test-sampling.cpp
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
llama_vocab dummy_vocab;
- auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
+ auto * result = llama_sampler_init_dry(&dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
auto * ctx = (llama_sampler_dry *) result->ctx;
// Process the token-based sequence breakers
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
- if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+ if (ctx->vocab->is_eog(cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
- if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+ if (ctx->vocab->is_eog(cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
continue;
}
- int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+ int len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(len0);
- len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+ len0 = ctx->vocab->token_to_piece(cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
- int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+ int len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(len1);
- len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+ len1 = ctx->vocab->token_to_piece(cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
- const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+ const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
- cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
+ cur_p->data[0].id = ctx->vocab->token_eot();
cur_p->data[0].logit = 1.0f;
return;
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
- const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+ const bool is_eog = ctx->vocab->is_eog(cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
- return llama_sampler_init_infill_impl(*ctx->vocab);
+ return llama_sampler_init_infill(ctx->vocab);
}
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
/* .free = */ llama_sampler_infill_free,
};
-struct llama_sampler * llama_sampler_init_infill_impl(
- const struct llama_vocab & vocab) {
+struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
return new llama_sampler {
/* .iface = */ &llama_sampler_infill_i,
/* .ctx = */ new llama_sampler_infill {
- /* .vocab = */ &vocab,
- /* .buf0 = */ std::vector<char>(512),
- /* .buf1 = */ std::vector<char>(512),
+ /* .vocab = */ vocab,
+ /* .buf0 = */ std::vector<char>(512),
+ /* .buf1 = */ std::vector<char>(512),
},
};
}
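
The infill sampler follows the same pattern: the caller obtains the vocab from the model and passes the pointer straight through (illustrative sketch, not part of the patch):

// illustrative: build and later release an infill sampler for a loaded model
const struct llama_vocab * vocab = llama_model_get_vocab(model);
struct llama_sampler * smpl = llama_sampler_init_infill(vocab);
// ... use smpl ...
llama_sampler_free(smpl);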
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
-#include "llama-grammar.h"
+#include "llama.h"
+
+#include <vector>
struct llama_vocab;
struct llama_grammar;
mutable int32_t n_sample;
};
-struct llama_sampler * llama_sampler_init_grammar_impl(
- const struct llama_vocab & vocab,
- const char * grammar_str,
- const char * grammar_root);
-
-struct llama_sampler * llama_sampler_init_infill_impl(
- const struct llama_vocab & vocab);
-
-struct llama_sampler * llama_sampler_init_dry_impl(
- const struct llama_vocab & vocab,
- int32_t context_size,
- float dry_multiplier,
- float dry_base,
- int32_t dry_allowed_length,
- int32_t dry_penalty_last_n,
- const char ** seq_breakers,
- size_t num_breakers);
-
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
#include "llama-vocab.h"
#include "llama-impl.h"
+#include "llama-model-loader.h"
#include "unicode.h"
#include <cstdarg>
#include <cstring>
#include <forward_list>
+#include <map>
#include <queue>
-#include <sstream>
+#include <set>
+#include <unordered_map>
//
// helpers
};
//
-// impl
+// tokenizers
//
struct llm_tokenizer {
- llm_tokenizer() {}
- virtual ~llm_tokenizer() = default;
+ llm_tokenizer() {}
+ virtual ~llm_tokenizer() = default;
};
-llama_vocab::~llama_vocab() {
- delete tokenizer;
-}
-
-int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
- GGML_ASSERT(token_left.find(' ') == std::string::npos);
- GGML_ASSERT(token_left.find('\n') == std::string::npos);
- GGML_ASSERT(token_right.find(' ') == std::string::npos);
- GGML_ASSERT(token_right.find('\n') == std::string::npos);
-
- auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
- if (it == bpe_ranks.end()) {
- return -1;
- }
-
- return it->second;
-}
-
-static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
- return vocab.type;
-}
-
-static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
-}
-
-static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
-}
-
-static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
-}
-
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
-}
-
-static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
- GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
- GGML_ASSERT(llama_is_byte_token(vocab, id));
- const auto & token_data = vocab.id_to_token.at(id);
- switch (llama_vocab_get_type(vocab)) {
- case LLAMA_VOCAB_TYPE_SPM:
- case LLAMA_VOCAB_TYPE_UGM: {
- auto buf = token_data.text.substr(3, 2);
- return strtol(buf.c_str(), NULL, 16);
- }
- case LLAMA_VOCAB_TYPE_BPE: {
- GGML_ABORT("fatal error");
- //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
- }
- case LLAMA_VOCAB_TYPE_WPM: {
- GGML_ABORT("fatal error");
- }
- default:
- GGML_ABORT("fatal error");
- }
-}
-
-static void llama_escape_whitespace(std::string & text) {
- replace_all(text, " ", "\xe2\x96\x81");
-}
-
-static void llama_unescape_whitespace(std::string & word) {
- replace_all(word, "\xe2\x96\x81", " ");
-}
-
struct llm_symbol {
using index = int;
index prev;
};
struct llm_tokenizer_spm : llm_tokenizer {
- llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+ llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};
struct llm_tokenizer_spm_session {
llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
// split string into utf8 chars
int index = 0;
size_t offs = 0;
}
private:
- void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
+ void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
auto text = std::string(symbol.text, symbol.n);
- auto token = vocab.token_to_id.find(text);
+ auto token = vocab.text_to_token(text);
// Do we need to support is_unused?
- if (token != vocab.token_to_id.end()) {
- output.push_back((*token).second);
+ if (token != LLAMA_TOKEN_NULL) {
+ output.push_back(token);
return;
}
// output any symbols that did not form tokens as bytes.
output.reserve(output.size() + symbol.n);
for (int j = 0; j < (int)symbol.n; ++j) {
- llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
- output.push_back(token_id);
+ llama_token id = vocab.byte_to_token(symbol.text[j]);
+ output.push_back(id);
}
return;
}
return;
}
const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
- auto token = vocab.token_to_id.find(text);
+ auto token = vocab.text_to_token(text);
- if (token == vocab.token_to_id.end()) {
+ if (token == LLAMA_TOKEN_NULL) {
return;
}
- if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
+ if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
return;
}
- const auto & tok_data = vocab.id_to_token[(*token).second];
+ const auto & tok_data = vocab.get_token_data(token);
llm_bigram_spm bigram;
bigram.left = left;
};
struct llm_tokenizer_bpe : llm_tokenizer {
- llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
- GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
- switch (vocab.type_pre) {
+ llm_tokenizer_bpe(const llama_vocab & vocab) {
+ GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
+ switch (vocab.get_pre_type()) {
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
regex_exprs = {
// original regex from tokenizer.json
};
struct llm_tokenizer_bpe_session {
- llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
- bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
+ llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
- static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) {
+ static void append(const llama_token token_id, std::vector<llama_token> & output) {
output.push_back(token_id);
}
- bool append_bos(std::vector<llama_vocab::id> & output) const {
- if (vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_bos_id);
+ bool append_bos(std::vector<llama_token> & output) const {
+ if (vocab.get_add_bos()) {
+ GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
+ output.push_back(vocab.token_bos());
return true;
}
return false;
}
- bool append_eos(std::vector<llama_vocab::id> & output) const {
- if (vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_eos_id);
+ bool append_eos(std::vector<llama_token> & output) const {
+ if (vocab.get_add_eos()) {
+ GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
+ output.push_back(vocab.token_eos());
return true;
}
return false;
}
- void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
- if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+ void check_double_bos_eos(const std::vector<llama_token> & output) const {
+ if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
LLAMA_LOG_WARN(
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
"Are you sure this is what you want?\n", __FUNCTION__);
}
- if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+ if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
LLAMA_LOG_WARN(
"%s: Added a EOS token to the prompt as specified by the model but the prompt "
"also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
}
}
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
int final_prev_index = -1;
- const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
+ const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
symbols_final.clear();
int index = 0;
size_t offset = 0;
- if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+ if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
offset = word.size();
}
}
const std::string str = std::string(symbol.text, symbol.n);
- const auto token = vocab.token_to_id.find(str);
+ const auto token = vocab.text_to_token(str);
- if (token == vocab.token_to_id.end()) {
+ if (token == LLAMA_TOKEN_NULL) {
for (auto j = str.begin(); j != str.end(); ++j) {
std::string byte_str(1, *j);
- auto token_multibyte = vocab.token_to_id.find(byte_str);
- if (token_multibyte != vocab.token_to_id.end()) {
- output.push_back(token_multibyte->second);
+ auto token_multibyte = vocab.text_to_token(byte_str);
+ if (token_multibyte != LLAMA_TOKEN_NULL) {
+ output.push_back(token_multibyte);
}
}
} else {
- output.push_back((*token).second);
+ output.push_back(token);
}
}
}
}
const llama_vocab & vocab;
- const llm_tokenizer_bpe * bpe_tokenizer;
+ const llm_tokenizer_bpe & tokenizer;
std::vector<llm_symbol> symbols;
std::vector<llm_symbol> symbols_final;
//
struct llm_tokenizer_wpm : llm_tokenizer {
- llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+ llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};
struct llm_tokenizer_wpm_session {
llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
- const auto & token_map = vocab.token_to_id;
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
// normalize and split by whitespace
std::vector<std::string> words = preprocess(text);
// bos token prepended already
for (int i = 0; i < n; ++i) {
// loop through possible match length
bool match = false;
- for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
- auto it = token_map.find(word1.substr(i, j - i));
- if (it != token_map.end()) {
- output.push_back(it->second);
+ for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
+ auto id = vocab.text_to_token(word1.substr(i, j - i));
+ if (id != LLAMA_TOKEN_NULL) {
+ output.push_back(id);
match = true;
i = j - 1;
break;
// we didn't find any matches for this word
if (current_tokens == output.size()) {
- output.push_back(vocab.special_unk_id);
+ output.push_back(vocab.token_unk());
}
}
}
//
struct llm_tokenizer_ugm : llm_tokenizer {
- llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
- if (vocab.precompiled_charsmap.size() > 0) {
+ llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
+ if (precompiled_charsmap.size() > 0) {
size_t charsmap_offset = 0;
// First four bytes of precompiled_charsmap contains length of binary
// blob containing XOR-compressed compact double array (XCDA) entries
- uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
+ uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
charsmap_offset += sizeof(xcda_blob_size);
- if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
+ if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
}
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
- xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
+ xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
xcda_array_size = xcda_blob_size / sizeof(uint32_t);
charsmap_offset += xcda_blob_size;
// Remaining bytes of precompiled charsmap contain null-terminated
// replacement strings for prefixes matched by the XCDA.
- prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
- prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
+ prefix_replacements = &precompiled_charsmap[charsmap_offset];
+ prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
}
- for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
- const auto &token_data = vocab.id_to_token[id];
+ for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+ const auto & token_data = vocab.get_token_data(id);
- if (llama_is_normal_token(vocab, id)) {
+ if (vocab.is_normal(id)) {
min_score = std::min<float>(min_score, token_data.score);
max_score = std::max<float>(max_score, token_data.score);
}
- if (llama_is_normal_token(vocab, id) ||
- llama_is_user_defined_token(vocab, id) ||
- llama_is_unused_token(vocab, id)) {
+ if (vocab.is_normal(id) ||
+ vocab.is_user_defined(id) ||
+ vocab.is_unused(id)) {
token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
}
- if (llama_is_user_defined_token(vocab, id)) {
+ if (vocab.is_user_defined(id)) {
user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
}
}
};
struct llm_tokenizer_ugm_session {
- llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
- ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
+ llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
/* This implementation is based on SentencePiece optimized Viterbi algorithm for
* unigram language models. The general idea is to:
* After processing the whole sequence we backtrack from the end to get
* the best tokenization.
*/
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
// get current size of output (for reversal later)
size_t output_size = output.size();
}
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
- std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
+ std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
// at the beginning tokenization score is zero
- tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
+ tokenization_results[0] = { vocab.token_unk(), 0, 0 };
for (size_t input_offset = 0; input_offset < input_len;) {
size_t prefix_offset = input_offset;
// traverse the token matcher trie to find a matching token
bool single_codepoint_token_found = false;
const struct best_tokenization & current_best = tokenization_results[input_offset];
- const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
+ const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
while (prefix_offset <= input_len && node != NULL) {
// check if we found valid token in prefix
single_codepoint_token_found = true;
}
llama_token token_id = node->value;
- const auto & token_data = vocab.id_to_token[token_id];
+ const auto & token_data = vocab.get_token_data(token_id);
// we set the user-defined token scores to 0 to make them more likely to be selected
// (normal token scores are log probabilities, so they are negative)
// score type is double here to make tokenization results exactly
// the same as in the HF tokenizer using SentencePiece
- const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
+ const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
const double challenger_score = current_best.score_sum + token_score;
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
if (challenger_score > current_champ.score_sum) {
// if we didn't find a valid token corresponding to the whole UTF code point
// then use unknown token as the tokenization of this UTF code point
if (!single_codepoint_token_found) {
- const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
+ const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
prefix_offset = input_offset + n_utf8_code_units;
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
if (challenger_score > current_champ.score_sum) {
- struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
+ struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
current_champ = challenger;
}
}
// merge sequences of consecutive unknown tokens into single unknown tokens
bool is_prev_unknown = false;
for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
- bool is_unknown = tokenization.token_id == vocab.special_unk_id;
+ bool is_unknown = tokenization.token_id == vocab.token_unk();
if (!(is_prev_unknown && is_unknown)) {
output.push_back(tokenization.token_id);
}
normalized->clear();
normalized->reserve(input.size() * 3);
- const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
+ const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
- bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
- bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
- bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
+ const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+ const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+ const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
bool is_space_prepended = false;
bool processing_non_ws = false;
// if input prefix matches some user-defined token return this token as normalization result
auto user_defined_token_match =
- ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+ tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
if (user_defined_token_match.second > 0) {
return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
}
size_t longest_prefix_length = 0;
size_t longest_prefix_offset = 0;
- if (ugm_tokenizer->xcda_array_size > 0) {
- struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
+ if (tokenizer.xcda_array_size > 0) {
+ struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
// Find the longest normalized sequence matching the input prefix by walking
// the XOR-compressed compact double array (XCDA) starting from the root node
if (longest_prefix_length > 0) {
// we have a match, so return the replacement sequence
- if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
+ if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
}
- const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
+ const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
}
}
const llama_vocab & vocab;
- const llm_tokenizer_ugm * ugm_tokenizer;
+ const llm_tokenizer_ugm & tokenizer;
};
//
}
struct llm_tokenizer_rwkv : llm_tokenizer {
- llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
+ llm_tokenizer_rwkv(const llama_vocab & vocab) {
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
// For now, we decode the vocab here into the lookup we'll use for tokenization.
// build trie
- for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
- const auto & token = vocab.id_to_token[id];
- const auto data = llama_unescape_rwkv_token(token.text);
- token_matcher.insert((const char *) data.data(), data.size(), id);
+ for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+ const auto & data = vocab.get_token_data(id);
+ const auto text = llama_unescape_rwkv_token(data.text);
+ token_matcher.insert((const char *) text.data(), text.size(), id);
}
}
};
struct llm_tokenizer_rwkv_session {
- llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
- rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
+ llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
uint32_t position = 0;
while (position < text.size()) {
- const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
+ const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
if (node == NULL) {
// no matching token found, add unknown token
- output.push_back(vocab.special_unk_id);
+ output.push_back(vocab.token_unk());
position += 1;
continue;
}
private:
const llama_vocab & vocab;
- const llm_tokenizer_rwkv & rwkv_tokenizer;
+ const llm_tokenizer_rwkv & tokenizer;
};
-void llama_vocab::init_tokenizer() {
- switch (type) {
- case LLAMA_VOCAB_TYPE_SPM:
- tokenizer = new llm_tokenizer_spm(*this);
- break;
- case LLAMA_VOCAB_TYPE_BPE:
- tokenizer = new llm_tokenizer_bpe(*this);
- break;
- case LLAMA_VOCAB_TYPE_WPM:
- tokenizer = new llm_tokenizer_wpm(*this);
- break;
- case LLAMA_VOCAB_TYPE_UGM:
- tokenizer = new llm_tokenizer_ugm(*this);
- break;
- case LLAMA_VOCAB_TYPE_RWKV:
- tokenizer = new llm_tokenizer_rwkv(*this);
- break;
- default:
- GGML_ABORT("unsupported vocab type");
- }
-}
-
//
-// (de-) tokenize
+// impl
//
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
} FRAGMENT_BUFFER_VARIANT_TYPE;
struct fragment_buffer_variant {
- fragment_buffer_variant(llama_vocab::id _token)
+ fragment_buffer_variant(llama_token _token)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
token(_token),
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
:
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
- token((llama_vocab::id) - 1),
+ token((llama_token) - 1),
raw_text(_raw_text),
offset(_offset),
length(_length){
}
const FRAGMENT_BUFFER_VARIANT_TYPE type;
- const llama_vocab::id token;
+ const llama_token token;
const std::string _dummy;
const std::string & raw_text;
const uint64_t offset;
const uint64_t length;
};
-// #define PRETOKENIZERDEBUG
+struct llama_vocab::impl {
+ uint32_t n_token_types = 0; // for BERT-style token types
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
- // for each special token
- for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
- const auto & data = vocab.id_to_token[special_id];
- const auto & special_token = data.text;
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
- if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
- // Ignore control and unknown tokens when parse_special == false
- continue;
- // User-defined tokens are still pre-tokenized before everything else
- // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
- // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
- }
+ int max_token_len = 0; // used for optimizing longest token search
- // for each text fragment
- std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
- while (it != buffer.end()) {
- auto & fragment = (*it);
+ // default LLaMA special tokens
+ // TODO: should we set all of these to LLAMA_TOKEN_NULL?
+ llama_token special_bos_id = 1;
+ llama_token special_eos_id = 2;
+ llama_token special_eot_id = LLAMA_TOKEN_NULL;
+ llama_token special_eom_id = LLAMA_TOKEN_NULL;
+ llama_token special_unk_id = 0;
+ llama_token special_sep_id = LLAMA_TOKEN_NULL;
+ llama_token special_pad_id = LLAMA_TOKEN_NULL;
+ llama_token special_mask_id = LLAMA_TOKEN_NULL;
- // if a fragment is text ( not yet processed )
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- const auto & raw_text = fragment.raw_text;
+ llama_token linefeed_id = 13;
- auto raw_text_base_offset = fragment.offset;
- auto raw_text_base_length = fragment.length;
+ // fim tokens
+ llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
+ llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+ llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
- // loop over the text
- while (true) {
- // find the first occurrence of a given special token in this fragment
- // passing offset argument only limit the "search area" but match coordinates
- // are still relative to the source full raw_text
- auto match = raw_text.find(special_token, raw_text_base_offset);
+ // tokenizer flags
+ bool add_space_prefix = false;
+ bool add_bos = false;
+ bool add_eos = false;
+ bool ignore_merges = false;
+ bool clean_spaces = false; // clean_up_tokenization_spaces
+ bool remove_extra_whitespaces = false;
+ bool escape_whitespaces = true;
+ bool treat_whitespace_as_suffix = false;
- // no occurrences found, stop processing this fragment for a given special token
- if (match == std::string::npos) break;
+ std::unordered_map<std::string, llama_token> token_to_id;
+ std::vector<token_data> id_to_token;
- // check if match is within bounds of offset <-> length
- if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
+ std::vector<llama_token> cache_special_tokens;
+ std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
- auto source = std::distance(buffer.begin(), it);
+ std::map<std::pair<std::string, std::string>, int> bpe_ranks;
- // if match is further than base offset
- // then we have some text to the left of it
- if (match > raw_text_base_offset) {
- // left
- const int64_t left_reminder_offset = raw_text_base_offset + 0;
- int64_t left_reminder_length = match - raw_text_base_offset;
+ // set of all tokens that cause "end of generation"
+ std::set<llama_token> special_eog_ids;
- if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
- while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
- left_reminder_length--;
- }
- }
+ std::unique_ptr<llm_tokenizer> tokenizer;
- if (left_reminder_length > 0) {
- buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
- it++;
- }
+ std::vector<char> precompiled_charsmap;
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
-#endif
- }
+ impl(const llama_vocab & vocab) : vocab(vocab) {
+ }
- // special token
- buffer.emplace_after(it, special_id);
- it++;
+ ~impl() = default;
- // right
- if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
- int64_t right_reminder_offset = match + special_token.length();
- int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+ void load(llama_model_loader & ml, const LLM_KV & kv);
- if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
- while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
- right_reminder_offset++;
- right_reminder_length--;
- }
- }
+ enum llama_vocab_type get_type() const;
- if (right_reminder_length > 0) {
- buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
- it++;
- }
+ std::string type_name() const;
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
-#endif
+ bool is_normal (llama_token id) const;
+ bool is_unknown (llama_token id) const;
+ bool is_control (llama_token id) const;
+ bool is_byte (llama_token id) const;
+ bool is_user_defined(llama_token id) const;
+ bool is_unused (llama_token id) const;
+ bool is_eog (llama_token id) const;
- if (source == 0) {
- buffer.erase_after(buffer.before_begin());
- } else {
- buffer.erase_after(std::next(buffer.begin(), (source - 1)));
- }
+ uint8_t token_to_byte(llama_token id) const;
- // repeat for the right side
- raw_text_base_offset = right_reminder_offset;
- raw_text_base_length = right_reminder_length;
+ llama_token_attr token_get_attr(llama_token id) const;
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
- } else {
- if (source == 0) {
- buffer.erase_after(buffer.before_begin());
- } else {
- buffer.erase_after(std::next(buffer.begin(), (source - 1)));
- }
- break;
- }
- }
- }
- it++;
- }
- }
-}
+ void init_tokenizer(enum llama_vocab_type type);
-std::vector<llama_vocab::id> llama_tokenize_internal(
- const llama_vocab & vocab,
- std::string raw_text,
- bool add_special,
- bool parse_special) {
- GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+ void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
- std::vector<llama_vocab::id> output;
- std::forward_list<fragment_buffer_variant> fragment_buffer;
+ std::string token_to_piece_for_cache(
+ llama_token token,
+ bool special) const;
- if (!raw_text.empty()) {
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
- tokenizer_st_partition(vocab, fragment_buffer, parse_special);
- }
- switch (vocab.type) {
- case LLAMA_VOCAB_TYPE_SPM:
- {
- // OG tokenizer behavior:
- //
- // tokenizer.encode('', add_special_tokens=True) returns [1]
- // tokenizer.encode('', add_special_tokens=False) returns []
+ std::vector<llama_token> tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special = false) const;
- bool is_prev_special = true; // prefix with space if first token
+ int32_t tokenize(
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) const;
- if (add_special && vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_bos_id);
- is_prev_special = true;
- }
+ // does not write null-terminator to buf
+ int32_t token_to_piece(
+ llama_token token,
+ char * buf,
+ int32_t length,
+ int32_t lstrip,
+ bool special) const;
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ // use cached data
+ const std::string & token_to_piece(llama_token token) const;
- // prefix with space if previous is special
- if (vocab.tokenizer_add_space_prefix && is_prev_special) {
- raw_text = " " + raw_text;
- }
+ int32_t detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const;
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
- llama_escape_whitespace(raw_text);
- llm_tokenizer_spm_session session(vocab);
- session.tokenize(raw_text, output);
- is_prev_special = false;
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
- is_prev_special = true;
- }
- }
+ std::string detokenize(
+ const std::vector<llama_token> & tokens,
+ bool special) const;
- if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
- LLAMA_LOG_WARN(
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
- "Are you sure this is what you want?\n", __FUNCTION__);
- }
+ void print_info() const;
- if (add_special && vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_eos_id);
- }
- } break;
- case LLAMA_VOCAB_TYPE_BPE:
- {
- llm_tokenizer_bpe_session session(vocab);
- // it calls some other methods that are not exist in llm_tokenizer,
- // here just cast it to bpe tokenizer object
- if (add_special) {
- session.append_bos(output);
- }
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+private:
+ const llama_vocab & vocab;
+};
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- session.append(fragment.token, output);
- }
- }
+void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+ struct gguf_context * ctx = ml.meta.get();
- if (add_special) {
- session.append_eos(output);
- session.check_double_bos_eos(output);
- }
- } break;
- case LLAMA_VOCAB_TYPE_WPM:
- {
- if (add_special) {
- GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_cls_id);
- }
+ // determine vocab type
+ {
+ std::string tokenizer_model;
+ std::string tokenizer_pre;
+
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
+
+ if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
+ type = LLAMA_VOCAB_TYPE_NONE;
+
+ // default special tokens
+ special_bos_id = LLAMA_TOKEN_NULL;
+ special_eos_id = LLAMA_TOKEN_NULL;
+ special_unk_id = LLAMA_TOKEN_NULL;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ special_mask_id = LLAMA_TOKEN_NULL;
+ linefeed_id = LLAMA_TOKEN_NULL;
+
+ // read vocab size from metadata
+ uint32_t n_tokens = 0;
+ if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
+ LLAMA_LOG_WARN("%s: there is no vocab_size in metadata\n", __func__);
+ }
- llm_tokenizer_wpm_session session(vocab);
+ return;
+ }
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (tokenizer_model == "llama") {
+ type = LLAMA_VOCAB_TYPE_SPM;
+
+ // default special tokens
+ special_bos_id = 1;
+ special_eos_id = 2;
+ special_unk_id = 0;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ special_mask_id = LLAMA_TOKEN_NULL;
+ } else if (tokenizer_model == "bert") {
+ type = LLAMA_VOCAB_TYPE_WPM;
+
+ // default special tokens
+ special_bos_id = 101;
+ special_eos_id = LLAMA_TOKEN_NULL;
+ special_unk_id = 100;
+ special_sep_id = 102;
+ special_pad_id = 0;
+ special_mask_id = 103;
+ } else if (tokenizer_model == "gpt2") {
+ type = LLAMA_VOCAB_TYPE_BPE;
+
+ // read bpe merges and populate bpe ranks
+ const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+ if (merges_keyidx == -1) {
+ throw std::runtime_error("cannot find tokenizer merges in model file\n");
+ }
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
- }
- }
+ const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+ for (int i = 0; i < n_merges; i++) {
+ const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+ //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
- if (add_special) {
- GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_sep_id);
- }
- } break;
- case LLAMA_VOCAB_TYPE_UGM:
- {
- if (add_special && vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_bos_id);
- }
- llm_tokenizer_ugm_session session(vocab);
+ std::string first;
+ std::string second;
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
- }
+ const size_t pos = word.find(' ', 1);
+
+ if (pos != std::string::npos) {
+ first = word.substr(0, pos);
+ second = word.substr(pos + 1);
}
- if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
- LLAMA_LOG_WARN(
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
- "Are you sure this is what you want?\n", __FUNCTION__);
+ bpe_ranks.emplace(std::make_pair(first, second), i);
+ }
+
+ // default special tokens
+ special_bos_id = 11;
+ special_eos_id = 11;
+ special_unk_id = LLAMA_TOKEN_NULL;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ special_mask_id = LLAMA_TOKEN_NULL;
+ } else if (tokenizer_model == "t5") {
+ type = LLAMA_VOCAB_TYPE_UGM;
+
+ // default special tokens
+ special_bos_id = LLAMA_TOKEN_NULL;
+ special_eos_id = 1;
+ special_unk_id = 2;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = 0;
+ special_mask_id = LLAMA_TOKEN_NULL;
+
+ const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+ if (precompiled_charsmap_keyidx != -1) {
+ size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+ precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
+#ifdef IS_BIG_ENDIAN
+ // correct endianness of data in precompiled_charsmap binary blob
+ uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
+ *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+ assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+ size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+ uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
+ for (size_t i = 0; i < xcda_array_size; ++i) {
+ xcda_array[i] = __builtin_bswap32(xcda_array[i]);
}
+#endif
+ }
+ } else if (tokenizer_model == "rwkv") {
+ type = LLAMA_VOCAB_TYPE_RWKV;
+
+ // default special tokens
+ special_bos_id = LLAMA_TOKEN_NULL;
+ special_eos_id = LLAMA_TOKEN_NULL;
+ special_unk_id = LLAMA_TOKEN_NULL;
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = LLAMA_TOKEN_NULL;
+ } else {
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
+ }
+
+ // for now, only BPE models have pre-tokenizers
+ if (type == LLAMA_VOCAB_TYPE_BPE) {
+ add_space_prefix = false;
+ clean_spaces = true;
+ if (tokenizer_pre.empty()) {
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+ LLAMA_LOG_WARN("%s: \n", __func__);
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (tokenizer_pre == "default") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ } else if (
+ tokenizer_pre == "llama3" ||
+ tokenizer_pre == "llama-v3" ||
+ tokenizer_pre == "llama-bpe"||
+ tokenizer_pre == "falcon3") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+ ignore_merges = true;
+ add_bos = true;
+ } else if (
+ tokenizer_pre == "deepseek-llm") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "deepseek-coder") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "deepseek-v3") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "falcon") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
+ } else if (
+ tokenizer_pre == "mpt") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
+ } else if (
+ tokenizer_pre == "starcoder") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+ } else if (
+ tokenizer_pre == "gpt-2" ||
+ tokenizer_pre == "phi-2" ||
+ tokenizer_pre == "jina-es" ||
+ tokenizer_pre == "jina-de" ||
+ tokenizer_pre == "gigachat" ||
+ tokenizer_pre == "jina-v1-en" ||
+ tokenizer_pre == "jina-v2-es" ||
+ tokenizer_pre == "jina-v2-de" ||
+ tokenizer_pre == "jina-v2-code" ||
+ tokenizer_pre == "roberta-bpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+ } else if (
+ tokenizer_pre == "refact") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
+ } else if (
+ tokenizer_pre == "command-r") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "qwen2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "stablelm2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+ } else if (
+ tokenizer_pre == "olmo") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+ } else if (
+ tokenizer_pre == "dbrx") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
+ } else if (
+ tokenizer_pre == "smaug-bpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+ } else if (
+ tokenizer_pre == "poro-chat") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "chatglm-bpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+ special_bos_id = LLAMA_TOKEN_NULL;
+ } else if (
+ tokenizer_pre == "viking") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "jais") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
+ } else if (
+ tokenizer_pre == "tekken") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+ clean_spaces = false;
+ ignore_merges = true;
+ add_bos = true;
+ } else if (
+ tokenizer_pre == "smollm") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "codeshell") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+ } else if (
+ tokenizer_pre == "bloom") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+ } else if (
+ tokenizer_pre == "gpt3-finnish") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+ } else if (
+ tokenizer_pre == "exaone") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+ } else if (
+ tokenizer_pre == "chameleon") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+ add_bos = true;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "minerva-7b") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+ } else if (
+ tokenizer_pre == "megrez") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+ } else {
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ }
+ } else if (type == LLAMA_VOCAB_TYPE_SPM) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_space_prefix = true;
+ clean_spaces = false;
+ add_bos = true;
+ add_eos = false;
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_space_prefix = false;
+ clean_spaces = true;
+ add_bos = true;
+ add_eos = false;
+ } else if (type == LLAMA_VOCAB_TYPE_UGM) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_bos = false;
+ add_eos = true;
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ add_space_prefix = false;
+ clean_spaces = false;
+ add_bos = false;
+ add_eos = false;
+ } else {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ }
+
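+ // GGUF metadata, when present, overrides the per-tokenizer defaults chosen above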
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
+ }
+
+ const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+ if (token_idx == -1) {
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+ }
+
+ const float * scores = nullptr;
+ const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+ if (score_idx != -1) {
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+ }
+
+ const int * toktypes = nullptr;
+ const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+ if (toktype_idx != -1) {
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+ }
+
+ uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
+ id_to_token.resize(n_tokens);
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ std::string word = gguf_get_arr_str(ctx, token_idx, i);
+ if (word.empty()) {
+ LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+ word = "[EMPTY_" + std::to_string(i) + "]";
+ }
+
+ token_to_id[word] = i;
+ max_token_len = std::max(max_token_len, (int) word.size());
+
+ auto & token_data = id_to_token[i];
+ token_data.text = std::move(word);
+ token_data.score = scores ? scores[i] : 0.0f;
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+ switch(toktypes[i]) {
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+ }
+ }
+ }
+ GGML_ASSERT(id_to_token.size() == token_to_id.size());
+
+ init_tokenizer(type);
+
+ // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
+ if (type == LLAMA_VOCAB_TYPE_SPM) {
+ try {
+ linefeed_id = vocab.byte_to_token('\n');
+ } catch (const std::exception & e) {
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+ linefeed_id = special_pad_id;
+ }
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+ linefeed_id = special_pad_id;
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+ const std::vector<int> ids = tokenize("\n", false);
+ GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+ linefeed_id = ids[0];
+ } else {
+ const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+
+ //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+ if (ids.empty()) {
+ LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+ linefeed_id = special_pad_id;
+ } else {
+ linefeed_id = ids[0];
+ }
+ }
+
+ // special tokens
+ {
+ const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+ { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
+ { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
+ { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
+ { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
+ };
+
+ for (const auto & it : special_token_types) {
+ const std::string & key = kv(std::get<0>(it));
+ int32_t & id = std::get<1>(it);
+
+ uint32_t new_id;
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
+ continue;
+ }
+ if (new_id >= id_to_token.size()) {
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
+ __func__, key.c_str(), new_id, id);
+ } else {
+ id = new_id;
+ }
+ }
+
+ // Handle add_bos and add_eos
+ {
+ bool temp = true;
+
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+ add_bos = temp;
+ }
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+ add_eos = temp;
+ }
+ }
- if (add_special && vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
- output.push_back(vocab.special_eos_id);
+ // auto-detect special tokens by text
+ // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+ // for now, we apply this workaround to find the tokens based on their text
+
+ for (const auto & t : token_to_id) {
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ if (special_eot_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<EOT>"
+ || t.first == "<|end▁of▁sentence|>" // DeepSeek
+ ) {
+ special_eot_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
}
- } break;
- case LLAMA_VOCAB_TYPE_RWKV:
- {
- llm_tokenizer_rwkv_session session(vocab);
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ }
-#ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
-#endif
+ // find EOM token: "<|eom_id|>"
+ if (special_eom_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eom_id|>"
+ ) {
+ special_eom_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
+ // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+ if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_prefix|>" // Qwen
+ || t.first == "<fim-prefix>"
+ || t.first == "<|fim▁begin|>" // DeepSeek
+ || t.first == "<PRE>"
+ ) {
+ special_fim_pre_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
}
- } break;
- case LLAMA_VOCAB_TYPE_NONE:
- GGML_ABORT("fatal error");
+ }
+
+ // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+ if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_suffix|>" // Qwen
+ || t.first == "<fim-suffix>"
+ || t.first == "<|fim▁hole|>" // DeepSeek
+ || t.first == "<SUF>"
+ ) {
+ special_fim_suf_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+ if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_middle|>" // Qwen
+ || t.first == "<fim-middle>"
+ || t.first == "<|fim▁end|>" // DeepSeek
+ || t.first == "<MID>"
+ ) {
+ special_fim_mid_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+ if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_pad|>" // Qwen
+ || t.first == "<fim-pad>"
+ || t.first == "<PAD>"
+ ) {
+ special_fim_pad_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+ if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_repo|>" // Qwen
+ || t.first == "<|repo_name|>"
+ || t.first == "<fim-repo>"
+ || t.first == "<REPO>"
+ ) {
+ special_fim_rep_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_SEP token: "<|file_sep|>"
+ if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|file_sep|>" // Qwen
+ ) {
+ special_fim_sep_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+ }
+
+ // maintain a list of tokens that cause end-of-generation
+ // this is currently determined based on the token text, which is obviously not ideal
+ // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+ special_eog_ids.clear();
+
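+ // FIM pad/repo/separator tokens are treated as end-of-generation as well (relevant mainly for infill)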
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
+ special_eog_ids.insert(special_fim_pad_id);
+ }
+
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
+ special_eog_ids.insert(special_fim_rep_id);
+ }
+
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
+ special_eog_ids.insert(special_fim_sep_id);
+ }
+
+ for (const auto & t : token_to_id) {
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<|eom_id|>"
+ || t.first == "<EOT>"
+ ) {
+ special_eog_ids.insert(t.second);
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ } else {
+ // token is control, but not marked as EOG -> print a debug log
+ if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
+ }
+ }
+
+ // sanity checks
+ if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
+ special_eog_ids.insert(special_eos_id);
+ LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
+ special_eog_ids.insert(special_eot_id);
+ LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
+ special_eog_ids.insert(special_eom_id);
+ LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
}
- return output;
-}
+ // build special tokens cache
+ {
+ for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
+ if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
+ cache_special_tokens.push_back(id);
+ }
+ }
-llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
- GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
- static const char * hex = "0123456789ABCDEF";
- switch (llama_vocab_get_type(vocab)) {
- case LLAMA_VOCAB_TYPE_SPM:
- case LLAMA_VOCAB_TYPE_UGM: {
- const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
- auto token = vocab.token_to_id.find(buf);
- if (token != vocab.token_to_id.end()) {
- return (*token).second;
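+ // sort the special tokens by descending text length so that longer tokens are matched first during partitioning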
+ std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
+ [&] (const llama_token a, const llama_token b) {
+ return id_to_token[a].text.size() > id_to_token[b].text.size();
}
- // Try to fall back to just the byte as a string
- const char buf2[2] = { (char)ch, 0 };
- return vocab.token_to_id.at(buf2);
+ );
+
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
+ }
+
+ // build token to piece cache
+ {
+ size_t size_cache = 0;
+
+ std::vector<std::string> cache(n_tokens);
+
+ for (uint32_t id = 0; id < n_tokens; ++id) {
+ cache[id] = token_to_piece_for_cache(id, true);
+
+ size_cache += cache[id].size();
}
- case LLAMA_VOCAB_TYPE_WPM:
- case LLAMA_VOCAB_TYPE_BPE: {
- return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
+
+ std::swap(cache_token_to_piece, cache);
+
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+ }
+
+ // Handle per token attributes
+ //NOTE: Each model customizes per token attributes.
+ //NOTE: Per token attributes are missing from the GGUF file.
+ //TODO: Extract attributes from GGUF file.
+ {
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+ for (const auto & substr : substrs) {
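+ // find() returns npos when the substring is absent, so "< npos" means it was found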
+ if (str.find(substr) < std::string::npos) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
+ uint32_t current = id_to_token.at(id).attr;
+ current = value ? (current | attr) : (current & ~attr);
+ id_to_token[id].attr = (llama_token_attr) current;
+ };
+
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+ _set_tokenid_attr(token_to_id.at(token), attr, value);
+ };
+
+ std::string model_name;
+ std::string tokenizer_pre;
+
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+ // model name to lowercase
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+ [] (const std::string::value_type x) {
+ return std::tolower(x);
+ }
+ );
+
+ // set attributes by model/tokenizer name
+ if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+ for (auto id : cache_special_tokens) {
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (const auto * token : {"</s>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+ }
}
- default:
- GGML_ABORT("fatal error");
}
}
-const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[token].text.c_str();
+enum llama_vocab_type llama_vocab::impl::get_type() const {
+ return type;
}
-float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[token].score;
+std::string llama_vocab::impl::type_name() const {
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+ default: return "unknown";
+ }
}
-llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[token].attr;
+bool llama_vocab::impl::is_normal(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
}
-bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
- return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
+bool llama_vocab::impl::is_unknown(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
}
-bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
- return llama_is_control_token(vocab, token);
+bool llama_vocab::impl::is_control(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
}
-llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
- return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
+bool llama_vocab::impl::is_byte(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
}
-llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
- return vocab.special_eos_id;
+bool llama_vocab::impl::is_user_defined(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
- return vocab.special_eot_id;
+bool llama_vocab::impl::is_unused(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
}
-llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
- return vocab.special_eom_id;
+bool llama_vocab::impl::is_eog(llama_token id) const {
+ return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
}
-llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
- return vocab.special_cls_id;
+uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
+ GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+ GGML_ASSERT(is_byte(id));
+ const auto & token_data = id_to_token.at(id);
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
+ auto buf = token_data.text.substr(3, 2);
+ return strtol(buf.c_str(), NULL, 16);
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ GGML_ABORT("fatal error");
+ }
+ case LLAMA_VOCAB_TYPE_WPM: {
+ GGML_ABORT("fatal error");
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
}
-llama_token llama_token_sep_impl(const struct llama_vocab & vocab) {
- return vocab.special_sep_id;
+llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token.at(id).attr;
}
-llama_token llama_token_nl_impl(const struct llama_vocab & vocab) {
- return vocab.linefeed_id;
-}
+void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+ LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
-llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
- return vocab.special_pad_id;
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
+ break;
+ case LLAMA_VOCAB_TYPE_RWKV:
+ tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
+ break;
+ default:
+ GGML_ABORT("unsupported vocab type");
+ }
}
-bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
- return vocab.tokenizer_add_bos;
-}
+//
+// (de-) tokenize
+//
-bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
- return vocab.tokenizer_add_eos;
-}
+// #define PRETOKENIZERDEBUG
-llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_pre_id;
-}
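+// scan each RAW_TEXT fragment in the buffer for occurrences of special tokens and split it in place:
+// every match becomes a TOKEN fragment, with the surrounding text kept as new RAW_TEXT fragments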
+void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
+ // for each special token
+ for (const llama_token special_id : cache_special_tokens) {
+ const auto & data = vocab.get_token_data(special_id);
+ const auto & text = data.text;
-llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_mid_id;
-}
+ if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+ // Ignore control and unknown tokens when parse_special == false
+ continue;
+ // User-defined tokens are still pre-tokenized before everything else
+ // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+ // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+ }
-llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_suf_id;
-}
+ // for each text fragment
+ std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+ while (it != buffer.end()) {
+ auto & fragment = (*it);
-llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_pre_id;
-}
+ // if a fragment is raw text (not yet processed)
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ const auto & raw_text = fragment.raw_text;
-llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_suf_id;
-}
+ auto raw_text_base_offset = fragment.offset;
+ auto raw_text_base_length = fragment.length;
-llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_mid_id;
-}
+ // loop over the text
+ while (true) {
+ // find the first occurrence of a given special token in this fragment
+ // passing the offset argument only limits the "search area", but match coordinates
+ // are still relative to the source full raw_text
+ auto match = raw_text.find(text, raw_text_base_offset);
-llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_pad_id;
-}
+ // no occurrences found, stop processing this fragment for a given special token
+ if (match == std::string::npos) break;
-llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_rep_id;
-}
+ // check if match is within bounds of offset <-> length
+ if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_sep_id;
-}
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
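+ // index of the fragment being split; it is erased once the replacement fragments have been inserted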
+ auto source = std::distance(buffer.begin(), it);
-int32_t llama_tokenize_impl(
- const struct llama_vocab & vocab,
- const char * text,
- int32_t text_len,
- llama_token * tokens,
- int32_t n_tokens_max,
- bool add_special,
- bool parse_special) {
- auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
- if (n_tokens_max < (int) res.size()) {
- // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
- return -((int) res.size());
+ // if match is further than base offset
+ // then we have some text to the left of it
+ if (match > raw_text_base_offset) {
+ // left
+ const int64_t left_reminder_offset = raw_text_base_offset + 0;
+ int64_t left_reminder_length = match - raw_text_base_offset;
+
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+ left_reminder_length--;
+ }
+ }
+
+ if (left_reminder_length > 0) {
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+ it++;
+ }
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+ }
+
+ // special token
+ buffer.emplace_after(it, special_id);
+ it++;
+
+ // right
+ if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
+ int64_t right_reminder_offset = match + text.length();
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
+
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+ right_reminder_offset++;
+ right_reminder_length--;
+ }
+ }
+
+ if (right_reminder_length > 0) {
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+ it++;
+ }
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+ if (source == 0) {
+ buffer.erase_after(buffer.before_begin());
+ } else {
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+ }
+
+ // repeat for the right side
+ raw_text_base_offset = right_reminder_offset;
+ raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+ } else {
+ if (source == 0) {
+ buffer.erase_after(buffer.before_begin());
+ } else {
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+ }
+ break;
+ }
+ }
+ }
+ it++;
+ }
}
+}
- for (size_t i = 0; i < res.size(); i++) {
- tokens[i] = res[i];
+// NOTE: avoid ever using this except for building the token_to_piece caches
+std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
+ std::string piece;
+ piece.resize(piece.capacity()); // using string internal cache
+ const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
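+ // a negative result means the buffer was too small; -n_chars is the required size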
+ if (n_chars < 0) {
+ piece.resize(-n_chars);
+ int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+ GGML_ASSERT(check == -n_chars);
+ }
+ else {
+ piece.resize(n_chars);
}
- return res.size();
+ return piece;
+}
+
+static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
+}
+
+static void llama_unescape_whitespace(std::string & word) {
+ replace_all(word, "\xe2\x96\x81", " ");
}
static std::string llama_decode_text(const std::string & text) {
return decoded_text;
}
-// does not write null-terminator to buf
-int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
+std::vector<llama_token> llama_vocab::impl::tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special) const {
+ GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+ std::vector<llama_token> output;
+ std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+ if (!raw_text.empty()) {
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+ tokenizer_st_partition(fragment_buffer, parse_special);
+ }
+
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ {
+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
+ // tokenizer.encode('', add_special_tokens=False) returns []
+
+ bool is_prev_special = true; // prefix with space if first token
+
+ if (add_special && add_bos) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ is_prev_special = true;
+ }
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text;
+
+ // prefix with space if previous is special
+ if (add_space_prefix && is_prev_special) {
+ text = ' ';
+ }
+
+ text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ llama_escape_whitespace(text);
+ llm_tokenizer_spm_session session(vocab);
+ session.tokenize(text, output);
+ is_prev_special = false;
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ is_prev_special = true;
+ }
+ }
+
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && add_eos) {
+ GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_eos_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ {
+ llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
+ // the session needs methods that do not exist on the base llm_tokenizer,
+ // so the stored tokenizer is cast to the BPE tokenizer type here
+ if (add_special) {
+ session.append_bos(output);
+ }
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ session.append(fragment.token, output);
+ }
+ }
+
+ if (add_special) {
+ session.append_eos(output);
+ session.check_double_bos_eos(output);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ {
+ if (add_special) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ }
+
+ llm_tokenizer_wpm_session session(vocab);
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special) {
+ GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_sep_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ {
+ if (add_special && add_bos) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ }
+ llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && add_eos) {
+ GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_eos_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_RWKV:
+ {
+ llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_NONE:
+ GGML_ABORT("fatal error");
+ }
+
+ return output;
+}
+
+int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
// ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
- const llama_token_attr attr = llama_token_get_attr_impl(vocab, token);
+ const llama_token_attr attr = token_get_attr(token);
if (!special && (attr & attr_special)) {
return 0;
}
// if we have a cache - use it
{
- const auto & cache = vocab.cache_token_to_piece;
+ const auto & cache = cache_token_to_piece;
if (!cache.empty()) {
const auto & result = cache.at(token);
}
}
- if (0 <= token && token < (int32_t) vocab.id_to_token.size()) {
- const std::string & token_text = vocab.id_to_token[token].text;
- switch (llama_vocab_get_type(vocab)) {
+ if (0 <= token && token < (int32_t) id_to_token.size()) {
+ const std::string & token_text = id_to_token[token].text;
+ switch (get_type()) {
case LLAMA_VOCAB_TYPE_WPM:
case LLAMA_VOCAB_TYPE_SPM:
case LLAMA_VOCAB_TYPE_UGM: {
return _try_copy(result.data(), result.size());
}
if (attr & LLAMA_TOKEN_ATTR_BYTE) {
- char byte = (char) llama_token_to_byte(vocab, token);
+ char byte = (char) token_to_byte(token);
return _try_copy((char*) &byte, 1);
}
break;
return 0;
}
-int32_t llama_detokenize_impl(
- const struct llama_vocab & vocab,
+const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
+ return cache_token_to_piece.at(token);
+}
+
+int32_t llama_vocab::impl::detokenize(
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
- bool unparse_special) {
- if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+ bool unparse_special) const {
+ if (type == LLAMA_VOCAB_TYPE_NONE) {
return 0;
}
- GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+ GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
int32_t avail = text_len_max;
int32_t total = 0;
// remove the leading space
- bool remove_space = vocab.tokenizer_add_space_prefix;
+ bool remove_space = add_space_prefix;
- if (remove_special && vocab.tokenizer_add_bos) {
- if (n_tokens > 0 && tokens[0] == vocab.special_bos_id) {
+ if (remove_special && add_bos) {
+ if (n_tokens > 0 && tokens[0] == special_bos_id) {
remove_space = false;
n_tokens--;
tokens++;
}
}
- if (remove_special && vocab.tokenizer_add_eos) {
- if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) {
+ if (remove_special && add_eos) {
+ if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
n_tokens--;
}
}
for (int32_t i = 0; i < n_tokens; ++i) {
GGML_ASSERT(avail >= 0);
- int32_t n_chars = llama_token_to_piece_impl(vocab, tokens[i], text, avail, remove_space, unparse_special);
+ int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
remove_space = false;
if (n_chars < 0) {
avail = 0;
return -total;
}
- if (vocab.tokenizer_clean_spaces) {
+ if (clean_spaces) {
text -= total; // restart text
// first pass: characters ?!., //TODO: where do these characters come from?
return total <= text_len_max ? total : -total;
}
-std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
+void llama_vocab::impl::print_info() const {
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
+
+ // special tokens
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
+
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
+
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
+
+ for (const auto & id : special_eog_ids) {
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
+ }
+
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
+}
+
+llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
+}
+
+llama_vocab::~llama_vocab() {
+}
+
+void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
+ pimpl->load(ml, kv);
+}
+
+enum llama_vocab_type llama_vocab::get_type() const {
+ return pimpl->type;
+}
+
+enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
+ return pimpl->pre_type;
+}
+
+uint32_t llama_vocab::n_tokens() const {
+ return (uint32_t) pimpl->id_to_token.size();
+}
+
+uint32_t llama_vocab::n_token_types() const {
+ return (uint32_t) pimpl->n_token_types;
+}
+
+std::string llama_vocab::type_name() const {
+ return pimpl->type_name();
+}
+
+bool llama_vocab::is_normal(llama_token id) const {
+ return pimpl->is_normal(id);
+}
+
+bool llama_vocab::is_unknown(llama_token id) const {
+ return pimpl->is_unknown(id);
+}
+
+bool llama_vocab::is_control(llama_token id) const {
+ return pimpl->is_control(id);
+}
+
+bool llama_vocab::is_byte(llama_token id) const {
+ return pimpl->is_byte(id);
+}
+
+bool llama_vocab::is_user_defined(llama_token id) const {
+ return pimpl->is_user_defined(id);
+}
+
+bool llama_vocab::is_unused(llama_token id) const {
+ return pimpl->is_unused(id);
+}
+
+bool llama_vocab::is_eog(llama_token id) const {
+ return pimpl->is_eog(id);
+}
+
+uint8_t llama_vocab::token_to_byte(llama_token id) const {
+ return pimpl->token_to_byte(id);
+}
+
+llama_token llama_vocab::byte_to_token(uint8_t ch) const {
+ GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+ static const char * hex = "0123456789ABCDEF";
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
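+ // SPM/UGM vocabs store byte tokens as "<0xXX>" strings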
+ const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+ auto token = pimpl->token_to_id.find(buf);
+ if (token != pimpl->token_to_id.end()) {
+ return (*token).second;
+ }
+ // Try to fall back to just the byte as a string
+ const char buf2[2] = { (char)ch, 0 };
+ return pimpl->token_to_id.at(buf2);
+ }
+ case LLAMA_VOCAB_TYPE_WPM:
+ case LLAMA_VOCAB_TYPE_BPE: {
+ return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
+}
+
+llama_token llama_vocab::text_to_token(const std::string & text) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ auto it = pimpl->token_to_id.find(text);
+ if (it != pimpl->token_to_id.end()) {
+ return (*it).second;
+ }
+ return LLAMA_TOKEN_NULL;
+}
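+
+// Illustrative only (not part of the upstream change): text_to_token() is an
+// exact lookup, so callers should compare the result against LLAMA_TOKEN_NULL
+// before using it, e.g.
+//
+//   const llama_token id = vocab.text_to_token("<|endoftext|>"); // example piece
+//   if (id != LLAMA_TOKEN_NULL) {
+//       // the piece exists in this vocabulary
+//   }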
+
+const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ return pimpl->id_to_token.at(id);
+}
+
+const char * llama_vocab::token_get_text(llama_token id) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ return pimpl->id_to_token.at(id).text.c_str();
+}
+
+float llama_vocab::token_get_score(llama_token id) const {
+ GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+ return pimpl->id_to_token.at(id).score;
+}
+
+llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
+ return pimpl->token_get_attr(id);
+}
+
+llama_token llama_vocab::token_bos() const {
+ return pimpl->special_bos_id;
+}
+
+llama_token llama_vocab::token_eos() const {
+ return pimpl->special_eos_id;
+}
+
+llama_token llama_vocab::token_eot() const {
+ return pimpl->special_eot_id;
+}
+
+llama_token llama_vocab::token_eom() const {
+ return pimpl->special_eom_id;
+}
+
+llama_token llama_vocab::token_unk() const {
+ return pimpl->special_unk_id;
+}
+
+llama_token llama_vocab::token_sep() const {
+ return pimpl->special_sep_id;
+}
+
+llama_token llama_vocab::token_nl() const {
+ return pimpl->linefeed_id;
+}
+
+llama_token llama_vocab::token_pad() const {
+ return pimpl->special_pad_id;
+}
+
+llama_token llama_vocab::token_prefix() const {
+ return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_middle() const {
+ return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_suffix() const {
+ return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_pre() const {
+ return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_fim_suf() const {
+ return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_mid() const {
+ return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_fim_pad() const {
+ return pimpl->special_fim_pad_id;
+}
+
+llama_token llama_vocab::token_fim_rep() const {
+ return pimpl->special_fim_rep_id;
+}
+
+llama_token llama_vocab::token_fim_sep() const {
+ return pimpl->special_fim_sep_id;
+}
+
+bool llama_vocab::get_add_space_prefix() const {
+ return pimpl->add_space_prefix;
+}
+
+bool llama_vocab::get_add_bos() const {
+ return pimpl->add_bos;
+}
+
+bool llama_vocab::get_add_eos() const {
+ return pimpl->add_eos;
+}
+
+bool llama_vocab::get_ignore_merges() const {
+ return pimpl->ignore_merges;
+}
+
+bool llama_vocab::get_clean_spaces() const {
+ return pimpl->clean_spaces;
+}
+
+bool llama_vocab::get_remove_extra_whitespaces() const {
+ return pimpl->remove_extra_whitespaces;
+}
+
+bool llama_vocab::get_escape_whitespaces() const {
+ return pimpl->escape_whitespaces;
+}
+
+bool llama_vocab::get_treat_whitespace_as_suffix() const {
+ return pimpl->treat_whitespace_as_suffix;
+}
+
+int llama_vocab::max_token_len() const {
+ return pimpl->max_token_len;
+}
+
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+ GGML_ASSERT(token_left.find(' ') == std::string::npos);
+ GGML_ASSERT(token_left.find('\n') == std::string::npos);
+ GGML_ASSERT(token_right.find(' ') == std::string::npos);
+ GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+ auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
+ if (it == pimpl->bpe_ranks.end()) {
+ return -1;
+ }
+
+ return it->second;
+}
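+
+// Illustrative note (not part of the upstream change): bpe_ranks maps a merge
+// pair to its priority, where a lower value means the merge is applied earlier,
+// and -1 signals that the pair is not a known merge. A minimal sketch of how a
+// caller might use this, assuming a `vocab` object:
+//
+//   const int rank = vocab.find_bpe_rank("he", "llo");
+//   if (rank >= 0) {
+//       // "he" + "llo" is a valid merge with priority `rank`
+//   }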
+
+int32_t llama_vocab::tokenize(
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) const {
+ auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+ if (n_tokens_max < (int) res.size()) {
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+ return -((int) res.size());
+ }
+
+ for (size_t i = 0; i < res.size(); i++) {
+ tokens[i] = res[i];
+ }
+
+ return res.size();
+}
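+
+// Illustrative only: as in the detokenize overload below, this overload returns
+// the negative of the required token count when n_tokens_max is too small, so
+// callers can resize and retry. A minimal sketch, assuming a `vocab` pointer and
+// a std::string `text`:
+//
+//   std::vector<llama_token> toks(16);
+//   int32_t n = vocab->tokenize(text.c_str(), (int32_t) text.size(),
+//                               toks.data(), (int32_t) toks.size(),
+//                               /*add_special=*/true, /*parse_special=*/false);
+//   if (n < 0) {
+//       toks.resize(-n);
+//       n = vocab->tokenize(text.c_str(), (int32_t) text.size(),
+//                           toks.data(), (int32_t) toks.size(), true, false);
+//   }
+//   toks.resize(n);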
+
+std::vector<llama_token> llama_vocab::tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special) const {
+ return pimpl->tokenize(raw_text, add_special, parse_special);
+}
+
+const std::string & llama_vocab::token_to_piece(llama_token token) const {
+ return pimpl->token_to_piece(token);
+}
+
+int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+ return pimpl->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_vocab::detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const {
+ return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
- int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
- n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+ n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return text;
}
+
+void llama_vocab::print_info() const {
+ pimpl->print_info();
+}
+
+//
+// interface implementation
+//
+
+int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
+ return vocab->n_tokens();
+}
+
+// deprecated
+int32_t llama_n_vocab(const struct llama_vocab * vocab) {
+ return llama_vocab_n_tokens(vocab);
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
+ return vocab->get_type();
+}
+
+const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->token_get_text(token);
+}
+
+float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->token_get_score(token);
+}
+
+enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->token_get_attr(token);
+}
+
+bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->is_eog(token);
+}
+
+bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
+ return vocab->is_control(token);
+}
+
+llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
+ return vocab->token_bos();
+}
+
+llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
+ return vocab->token_eos();
+}
+
+llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
+ return vocab->token_eot();
+}
+
+// deprecated
+llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
+ return vocab->token_bos();
+}
+
+llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
+ return vocab->token_sep();
+}
+
+llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
+ return vocab->token_nl();
+}
+
+llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
+ return vocab->token_pad();
+}
+
+bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
+ return vocab->get_add_bos();
+}
+
+bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
+ return vocab->get_add_eos();
+}
+
+llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
+ return vocab->token_fim_pre();
+}
+
+llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
+ return vocab->token_fim_suf();
+}
+
+llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
+ return vocab->token_fim_mid();
+}
+
+llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
+ return vocab->token_fim_pad();
+}
+
+llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
+ return vocab->token_fim_rep();
+}
+
+llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
+ return vocab->token_fim_sep();
+}
+
+// deprecated
+const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_get_text(vocab, token);
+}
+
+// deprecated
+float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_get_score(vocab, token);
+}
+
+// deprecated
+enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_get_attr(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_is_eog(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
+ return llama_vocab_is_control(vocab, token);
+}
+
+// deprecated
+llama_token llama_token_bos(const struct llama_vocab * vocab) {
+ return llama_vocab_bos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eos(const struct llama_vocab * vocab) {
+ return llama_vocab_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eot(const struct llama_vocab * vocab) {
+ return llama_vocab_eot(vocab);
+}
+
+// deprecated
+llama_token llama_token_cls(const struct llama_vocab * vocab) {
+ //return llama_vocab_cls(vocab);
+ return llama_vocab_bos(vocab); // avoid deprecation warning
+}
+
+// deprecated
+llama_token llama_token_sep(const struct llama_vocab * vocab) {
+ return llama_vocab_sep(vocab);
+}
+
+// deprecated
+llama_token llama_token_nl (const struct llama_vocab * vocab) {
+ return llama_vocab_nl(vocab);
+}
+
+// deprecated
+llama_token llama_token_pad(const struct llama_vocab * vocab) {
+ return llama_vocab_pad(vocab);
+}
+
+// deprecated
+bool llama_add_bos_token(const struct llama_vocab * vocab) {
+ return llama_vocab_get_add_bos(vocab);
+}
+
+// deprecated
+bool llama_add_eos_token(const struct llama_vocab * vocab) {
+ return llama_vocab_get_add_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_pre(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_suf(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_mid(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_pad(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_rep(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
+ return llama_vocab_fim_sep(vocab);
+}
+
+//
+// tokenization
+//
+
+int32_t llama_tokenize(
+ const struct llama_vocab * vocab,
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) {
+ return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
+}
+
+int32_t llama_token_to_piece(
+ const struct llama_vocab * vocab,
+ llama_token token,
+ char * buf,
+ int32_t length,
+ int32_t lstrip,
+ bool special) {
+ return vocab->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_detokenize(
+ const struct llama_vocab * vocab,
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) {
+ return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
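+
+// Illustrative only: the C functions above delegate one-to-one to the
+// llama_vocab member functions and share the same negative-return convention
+// when the output buffer is too small. A minimal round-trip sketch, assuming a
+// valid `vocab` pointer:
+//
+//   llama_token toks[64];
+//   int32_t n_tok = llama_tokenize(vocab, "hello world", 11, toks, 64, true, false);
+//   char buf[256];
+//   int32_t n_chr = llama_detokenize(vocab, toks, n_tok, buf, (int32_t) sizeof(buf), false, false);
+//   // buf is not guaranteed to be null-terminated; treat n_chr as the length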
+
#include <string>
#include <vector>
-#include <unordered_map>
-#include <map>
-#include <set>
-
-static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
- switch (type) {
- case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
- case LLAMA_VOCAB_TYPE_UGM: return "UGM";
- case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
- default: return "unknown";
- }
-}
-
-struct llm_tokenizer;
+#include <memory>
-struct llama_vocab {
- using id = llama_token;
- using token = std::string;
- using tattr = llama_token_attr;
+struct LLM_KV;
+struct llama_model_loader;
+struct llama_vocab {
struct token_data {
- token text;
- float score;
- tattr attr;
+ std::string text;
+ float score;
+ llama_token_attr attr;
};
- uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
+ llama_vocab();
+ ~llama_vocab();
+
+ void load(llama_model_loader & ml, const LLM_KV & kv);
- enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
- enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+ enum llama_vocab_type get_type() const;
+ enum llama_vocab_pre_type get_pre_type() const;
- int max_token_len = 0; // used for optimizing longest token search
+ uint32_t n_tokens() const;
+ uint32_t n_token_types() const;
- std::unordered_map<token, id> token_to_id;
- std::vector<token_data> id_to_token;
+ std::string type_name() const;
- std::vector<id> cache_special_tokens;
- std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
+ bool is_normal (llama_token id) const;
+ bool is_unknown (llama_token id) const;
+ bool is_control (llama_token id) const;
+ bool is_byte (llama_token id) const;
+ bool is_user_defined(llama_token id) const;
+ bool is_unused (llama_token id) const;
+ bool is_eog (llama_token id) const;
- std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+ uint8_t token_to_byte(llama_token id) const;
+ llama_token byte_to_token(uint8_t ch) const;
- // default LLaMA special tokens
- // TODO: should we set all of these to LLAMA_TOKEN_NULL?
- id special_bos_id = 1;
- id special_eos_id = 2;
- id special_eot_id = LLAMA_TOKEN_NULL;
- id special_eom_id = LLAMA_TOKEN_NULL;
- id special_unk_id = 0;
- id special_sep_id = LLAMA_TOKEN_NULL;
- id special_pad_id = LLAMA_TOKEN_NULL;
- id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
- id special_mask_id = LLAMA_TOKEN_NULL;
+ llama_token text_to_token(const std::string & text) const;
- id linefeed_id = 13;
+ const token_data & get_token_data(llama_token id) const;
- // fim tokens
- id special_fim_pre_id = LLAMA_TOKEN_NULL;
- id special_fim_suf_id = LLAMA_TOKEN_NULL;
- id special_fim_mid_id = LLAMA_TOKEN_NULL;
- id special_fim_pad_id = LLAMA_TOKEN_NULL;
- id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
- id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+ const char * token_get_text (llama_token id) const;
+ float token_get_score(llama_token id) const;
+ llama_token_attr token_get_attr (llama_token id) const;
- // set of all tokens that cause "end of generation"
- std::set<id> special_eog_ids;
+ llama_token token_bos() const;
+ llama_token token_eos() const;
+ llama_token token_eot() const;
+ llama_token token_eom() const;
+ llama_token token_unk() const;
+ llama_token token_sep() const;
+ llama_token token_nl () const;
+ llama_token token_pad() const;
- // tokenizer flags
- bool tokenizer_add_space_prefix = false;
- bool tokenizer_add_bos = false;
- bool tokenizer_add_eos = false;
- bool tokenizer_ignore_merges = false;
- bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
- bool tokenizer_remove_extra_whitespaces = false;
- bool tokenizer_escape_whitespaces = true;
- bool tokenizer_treat_whitespace_as_suffix = false;
+ llama_token token_prefix() const;
+ llama_token token_middle() const;
+ llama_token token_suffix() const;
- std::vector<char> precompiled_charsmap;
+ llama_token token_fim_pre() const;
+ llama_token token_fim_suf() const;
+ llama_token token_fim_mid() const;
+ llama_token token_fim_pad() const;
+ llama_token token_fim_rep() const;
+ llama_token token_fim_sep() const;
- llm_tokenizer * tokenizer = nullptr;
+ bool get_add_space_prefix () const;
+ bool get_add_bos () const;
+ bool get_add_eos () const;
+ bool get_ignore_merges () const;
+ bool get_clean_spaces () const;
+ bool get_remove_extra_whitespaces () const;
+ bool get_escape_whitespaces () const;
+ bool get_treat_whitespace_as_suffix() const;
- llama_vocab() = default;
- ~llama_vocab();
+ int max_token_len() const;
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
- void init_tokenizer();
+ int32_t tokenize(
+ const char * text,
+ int32_t text_len,
+ llama_token * tokens,
+ int32_t n_tokens_max,
+ bool add_special,
+ bool parse_special) const;
+
+ std::vector<llama_token> tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special = false) const;
+
+ // does not write null-terminator to buf
+ int32_t token_to_piece(
+ llama_token token,
+ char * buf,
+ int32_t length,
+ int32_t lstrip,
+ bool special) const;
+
+ // use cached data
+ const std::string & token_to_piece(llama_token token) const;
+
+ int32_t detokenize(
+ const llama_token * tokens,
+ int32_t n_tokens,
+ char * text,
+ int32_t text_len_max,
+ bool remove_special,
+ bool unparse_special) const;
+
+ std::string detokenize(
+ const std::vector<llama_token> & tokens,
+ bool special) const;
+
+ void print_info() const;
+
+private:
+ struct impl;
+ std::unique_ptr<impl> pimpl;
};
-
-//
-// internal API
-//
-
-// TODO: rename to llama_tokenize_impl
-// TODO: This should probably be in llama.h
-std::vector<llama_vocab::id> llama_tokenize_internal(
- const llama_vocab & vocab,
- std::string raw_text,
- bool add_special,
- bool parse_special = false);
-
-// TODO: move the API below as member functions of llama_vocab
-llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
-
-const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
-
-float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
-
-llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
-
-bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
-
-bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
-
-llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
-llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
-llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
-llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
-llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
-llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
-
-llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
-llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
-llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
-
-llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
-llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
-
-bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
-bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
-
-int32_t llama_tokenize_impl(
- const struct llama_vocab & vocab,
- const char * text,
- int32_t text_len,
- llama_token * tokens,
- int32_t n_tokens_max,
- bool add_special,
- bool parse_special);
-
-// does not write null-terminator to buf
-int32_t llama_token_to_piece_impl(
- const struct llama_vocab & vocab,
- llama_token token,
- char * buf,
- int32_t length,
- int32_t lstrip,
- bool special);
-
-// check if token0 is contained as a prefix in token1
-bool llama_token_is_prefix_impl(
- const struct llama_vocab & vocab,
- llama_token token0,
- llama_token token1);
-
-int32_t llama_detokenize_impl(
- const struct llama_vocab & vocab,
- const llama_token * tokens,
- int32_t n_tokens,
- char * text,
- int32_t text_len_max,
- bool remove_special,
- bool unparse_special);
-
-std::string llama_detokenize(
- const struct llama_vocab & vocab,
- const std::vector<llama_token> & tokens,
- bool special);
#include "llama-kv-cache.h"
#include "llama-model-loader.h"
#include "llama-model.h"
-#include "llama-quant.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include <algorithm>
#include <array>
#include <cassert>
-#include <cctype>
#include <cfloat>
-#include <cinttypes>
-#include <climits>
#include <cmath>
-#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <functional>
-#include <initializer_list>
-#include <locale>
-#include <map>
-#include <numeric>
-#include <type_traits>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-//
-// tensor loading (TODO: add llama_tesor_loader?)
-//
-
-static int llama_get_device_count(const llama_model & model) {
- return (int) model.devices.size();
-}
-
-// checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
- GGML_ASSERT(w != nullptr);
-
- if (op == GGML_OP_NONE) {
- return true;
- }
-
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context_ptr ctx_ptr { ggml_init(params) };
- if (!ctx_ptr) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ggml_context * ctx = ctx_ptr.get();
-
- ggml_tensor * op_tensor = nullptr;
-
- switch (op) {
- case GGML_OP_GET_ROWS:
- {
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_get_rows(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT:
- {
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
- op_tensor = ggml_mul_mat(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
- } break;
- case GGML_OP_ADD:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_add(ctx, a, w);
- } break;
- case GGML_OP_MUL:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_mul(ctx, a, w);
- } break;
- case GGML_OP_DIV:
- {
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
- op_tensor = ggml_div(ctx, a, w);
- } break;
- case GGML_OP_ROPE:
- {
- int n_embd_head = hparams.n_embd_head_v;
- int n_head = hparams.n_head();
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_rope_ext(
- ctx, a, b, w,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0
- );
-
- } break;
- case GGML_OP_SSM_CONV:
- {
- // FIXME
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
- } break;
- case GGML_OP_SSM_SCAN:
- {
- // FIXME
- const int64_t d_state = w->ne[0];
- const int64_t d_inner = w->ne[1];
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 1;
- ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
- ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
- ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
- ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
- } break;
- case GGML_OP_RWKV_WKV6:
- {
- // FIXME
- const int64_t S = 123;
- const int64_t H = 123;
- const int64_t n_tokens = 123;
- const int64_t n_seqs = 123;
- ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens);
- ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
- ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
- ggml_tensor * tf = w;
- ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
- } break;
- case GGML_OP_IM2COL:
- {
- const int n_embd = hparams.n_embd;
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
- } break;
- default:
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
- }
-
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
- GGML_ASSERT(w->buffer == nullptr);
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
- ggml_backend_buffer_free(w->buffer);
- w->buffer = nullptr;
-
- return op_supported;
-}
-
-// find the first buffer type in the list that can use the tensor
-static ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) {
- GGML_ASSERT(!buft_list.empty());
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
- return cur_buft;
- }
- }
- return nullptr;
-}
-
-// CPU: ACCEL -> CPU extra -> GPU host -> CPU
-static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
- llama_model::buft_list_t buft_list;
-
- // add ACCEL buffer types
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
- auto * buft = ggml_backend_dev_buffer_type(dev);
- // skip
- if (buft != ggml_backend_cpu_buffer_type()) {
- buft_list.emplace_back(dev, buft);
- }
- }
- }
-
- // add extra buffer types
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
- auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
- ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
- if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
- while (extra_bufts && *extra_bufts) {
- buft_list.emplace_back(cpu_dev, *extra_bufts);
- ++extra_bufts;
- }
- }
-
- // add a host buffer type
- // storing the tensors in a host buffer is useful when the processing of large batches
- // is offloaded to a GPU device, since it reduces the time spent on data transfers
- // generally, this will be done using the first device in the list
- // a better approach would be to handle this on a weight-by-weight basis using the offload_op
- // function of the device to determine if it would benefit from being stored in a host buffer
- for (auto * dev : model.devices) {
- ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
- if (buft) {
- buft_list.emplace_back(dev, buft);
- break;
- }
- }
-
- // add the CPU buffer type
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
- if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
- buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
- }
- }
-
- return buft_list;
-}
-
-// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
-static llama_model::buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
- llama_model::buft_list_t buft_list;
-
- // add the device split buffer type if requested and available
- if (split_mode == LLAMA_SPLIT_MODE_ROW) {
- ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
- auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
- ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
- if (ggml_backend_split_buffer_type_fn) {
- size_t dev_index = [&]() {
- auto * reg = ggml_backend_dev_backend_reg(dev);
- for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
- if (ggml_backend_reg_dev_get(reg, i) == dev) {
- return i;
- }
- }
- throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
- }();
- auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
- if (buft != nullptr) {
- buft_list.emplace_back(dev, buft);
- }
- }
- }
-
- // add the device default buffer type
- buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
-
- return buft_list;
-}
-
-// Returns false if cancelled by progress_callback
-static bool llm_load_tensors(
- llama_model_loader & ml,
- llama_model & model,
- int n_gpu_layers,
- enum llama_split_mode split_mode,
- int main_gpu,
- const float * tensor_split,
- bool use_mlock,
- llama_progress_callback progress_callback,
- void * progress_callback_user_data) {
- auto & hparams = model.hparams;
-
- model.split_mode = split_mode;
- model.main_gpu = main_gpu;
- model.n_gpu_layers = n_gpu_layers;
-
- const int n_layer = hparams.n_layer;
-
- bool use_mmap_buffer = true;
-
- // build a list of buffer types for the CPU and GPU devices
- model.cpu_buft_list = make_cpu_buft_list(model);
- for (auto * dev : model.devices) {
- llama_model::buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
- // add CPU buffer types as a fallback
- buft_list.insert(buft_list.end(), model.cpu_buft_list.begin(), model.cpu_buft_list.end());
- model.gpu_buft_list.emplace(dev, std::move(buft_list));
- }
-
- // calculate the split points
- int device_count = llama_get_device_count(model);
- bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
- std::vector<float> splits(device_count);
- if (all_zero) {
- // default split, by free memory
- for (int i = 0; i < device_count; ++i) {
- ggml_backend_dev_t dev = model.devices[i];
- size_t total;
- size_t free;
- ggml_backend_dev_memory(dev, &free, &total);
- splits[i] = free;
- }
- } else {
- std::copy(tensor_split, tensor_split + device_count, splits.begin());
- }
-
- // sum and normalize the splits to get the split points
- float split_sum = 0.0f;
- for (int i = 0; i < device_count; ++i) {
- split_sum += splits[i];
- splits[i] = split_sum;
- }
- for (int i = 0; i < device_count; ++i) {
- splits[i] /= split_sum;
- }
-
- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
- const int act_gpu_layers = model.devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
- auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
- if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
- return {cpu_dev, &model.cpu_buft_list};
- }
- int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
- auto * dev = model.devices.at(layer_gpu);
- return {dev, &model.gpu_buft_list.at(dev)};
- };
-
- // assign the input layer
- // there is very little benefit to offloading the input layer, so always keep it on the CPU
- model.dev_input = { cpu_dev, &model.cpu_buft_list };
-
- // assign the repeating layers to the devices according to the splits
- model.dev_layer.resize(n_layer);
- for (int il = 0; il < n_layer; ++il) {
- model.dev_layer[il] = get_layer_buft_list(il);
- }
- // assign the output layer
- model.dev_output = get_layer_buft_list(n_layer);
-
- // one ggml context per buffer type
- int max_n_tensors = ml.n_tensors;
- max_n_tensors += 1; // duplicated output tensor
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
-
- std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
- auto it = ctx_map.find(buft);
- if (it == ctx_map.end()) {
- ggml_init_params params = {
- /*.mem_size =*/ ctx_size,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context * ctx = ggml_init(params);
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ctx_map[buft] = ctx;
- model.ctxs.emplace_back(ctx);
- return ctx;
- }
- return it->second;
- };
-
- // create tensors for the weights
- {
- // note: cast to int64_t since we will use these for the tensor dimensions
- const int64_t n_head = hparams.n_head();
- const int64_t n_head_kv = hparams.n_head_kv();
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_embd_head_k = hparams.n_embd_head_k;
- const int64_t n_embd_head_v = hparams.n_embd_head_v;
- const int64_t n_ff = hparams.n_ff();
- const int64_t n_embd_gqa = n_embd_v_gqa;
- const int64_t n_vocab = hparams.n_vocab;
- const int64_t n_vocab_type = hparams.n_vocab_type;
- const int64_t n_rot = hparams.n_rot;
- const int64_t n_expert = hparams.n_expert;
- const int64_t n_expert_used = hparams.n_expert_used;
- const int64_t n_ctx_train = hparams.n_ctx_train;
-
- if (n_expert > 0 && hparams.n_expert_used == 0) {
- throw std::runtime_error("model has expert layers but no expert layers are used");
- }
-
- int n_moved_tensors = 0;
- ggml_tensor * first_moved_tensor = nullptr;
- ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
- ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
-
- auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
- ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
- if (!t_meta) {
- if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
- return nullptr;
- }
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
- }
-
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
- // the tensor is duplicated
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
- llm_tensor tn_tensor = tn.tensor;
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & llama_model_loader::TENSOR_DUPLICATED) {
- tn_tensor = LLM_TENSOR_OUTPUT;
- }
-
- llm_tensor_info info;
- try {
- info = llm_tensor_info_for(tn_tensor);
- } catch (const std::out_of_range & e) {
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
- }
-
- // tensors with "bias" suffix are always used with GGML_OP_ADD
- ggml_op op;
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
- if (bias) {
- op = GGML_OP_ADD;
- } else {
- op = info.op;
- }
-
- // sanity checks
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
- if (tn.bid != -1) {
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
- }
- } else {
- if (tn.bid == -1) {
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
- }
- }
-
- // select the buffer type for this tensor
- llama_model::buft_list_t * buft_list;
- switch (info.layer) {
- case LLM_TENSOR_LAYER_INPUT:
- buft_list = model.dev_input.buft_list;
- break;
- case LLM_TENSOR_LAYER_OUTPUT:
- buft_list = model.dev_output.buft_list;
- break;
- case LLM_TENSOR_LAYER_REPEATING:
- buft_list = model.dev_layer.at(tn.bid).buft_list;
- break;
- default:
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
- }
-
- ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
- if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
- }
-
- // avoid using a host buffer when using mmap
- auto * buft_dev = ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- buft = ggml_backend_dev_buffer_type(cpu_dev);
- }
-
- if (buft != buft_list->front().second) {
- n_moved_tensors++;
- if (!first_moved_tensor) {
- first_moved_tensor = t_meta;
- first_moved_from_buft = buft_list->front().second;
- first_moved_to_buft = buft;
- }
- }
-
- ggml_context * ctx = ctx_for_buft(buft);
-
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
- if (flags & llama_model_loader::TENSOR_DUPLICATED) {
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
- if (t) {
- return t;
- }
- }
- return ml.create_tensor(ctx, tn, ne, flags);
- };
-
- model.layers.resize(n_layer);
-
- // TODO: move to a separate function
- const auto tn = LLM_TN(model.arch);
- switch (model.arch) {
- case LLM_ARCH_LLAMA:
- case LLM_ARCH_REFACT:
- case LLM_ARCH_MINICPM:
- case LLM_ARCH_GRANITE:
- case LLM_ARCH_GRANITE_MOE:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- }
- else {
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- }
-
- if (n_expert == 0) {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
- // optional MLP bias
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- } else {
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- }
- } break;
- case LLM_ARCH_DECI:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
- const int64_t n_ff = hparams.n_ff(i);
- const int64_t n_head = hparams.n_head(i);
- const int64_t n_head_kv = hparams.n_head_kv(i);
-
- if (n_head_kv == 0 && n_head > 0) {
- // linear attention for DeciLMCausalModel
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- }
- else if (n_head_kv > 0) {
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- }
-
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- }
- else {
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- }
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
- // optional MLP bias
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_MINICPM3:
- {
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-
- const int64_t q_lora_rank = hparams.n_lora_q;
- const int64_t kv_lora_rank = hparams.n_lora_kv;
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
-
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
-
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- }
- } break;
- case LLM_ARCH_GROK:
- {
- if (n_expert == 0) {
- throw std::runtime_error("Grok model cannot have zero experts");
- }
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
-
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_DBRX:
- {
- if (n_expert == 0) {
- throw std::runtime_error("DBRX model cannot have zero experts");
- }
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_BAICHUAN:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- {
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_FALCON:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- {
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- if (!model.output) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
- }
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_STARCODER:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
- // output
- {
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- if (!model.output) {
- // needs to be on GPU
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_BERT:
- case LLM_ARCH_NOMIC_BERT:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
-
- if (model.arch == LLM_ARCH_BERT) {
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
- model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- model.cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- model.cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- }
-
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- if (model.arch == LLM_ARCH_BERT) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- } else {
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- }
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-
- if (model.arch == LLM_ARCH_BERT) {
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- } else {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- }
-
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_JINA_BERT_V2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
- model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
-
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
-            model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
-
- model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i]; // JinaBertLayer
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-
-                layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
-                layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
-
-                layer.attn_out_norm   = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
- layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
-
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
- layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_BLOOM:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_MPT:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- if (!model.output) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // AWQ ScaleActivation layer
- layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_STABLELM:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- // optional bias tensors, present in Stable LM 2 1.6B
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // optional q and k layernorms, present in StableLM 2 12B
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_QWEN:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
- }
- } break;
- case LLM_ARCH_QWEN2:
- case LLM_ARCH_QWEN2VL:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_QWEN2MOE:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
- }
-
- // MoE branch
- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
-
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
- // Shared expert branch
- const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
-
- layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
- }
- } break;
- case LLM_ARCH_PHI2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
- model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- if (layer.wqkv == nullptr) {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
-
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
-
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- }
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_PHI3:
- {
- const int64_t n_embd_head = n_embd / n_head;
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
-
- layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- }
- } break;
- case LLM_ARCH_PLAMO:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_GPT2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
- model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_CODESHELL:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_ORION:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_INTERNLM2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_GEMMA:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_GEMMA2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_STARCODER2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
- // optional bias tensors
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
- }
- } break;
- case LLM_ARCH_MAMBA:
- {
- const int64_t d_conv = hparams.ssm_d_conv;
- const int64_t d_inner = hparams.ssm_d_inner;
- const int64_t d_state = hparams.ssm_d_state;
- const int64_t dt_rank = hparams.ssm_dt_rank;
-
- // only an expansion factor of 2 is supported for now
- if (2 * n_embd != d_inner) {
- throw std::runtime_error("only an expansion factor of 2 is supported for now");
- }
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed, duplicated to allow offloading
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- // norm
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
-
- layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
- layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
-
- layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
-
- layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
- layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
-
- // no "weight" suffix for these
- layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
- layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
-
- // out_proj
- layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_XVERSE:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_COMMAND_R:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- // init output from the input tok embed
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
-                    if (n_layer >= 64) {
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
- }
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_COHERE2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
- // init output from the input tok embed
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
- llama_model_loader::TENSOR_DUPLICATED);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
- }
-            } break;
- case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_OLMO2:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
- layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
- }
- } break;
- case LLM_ARCH_OLMOE:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
-
- // MoE branch
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_OPENELM:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- // init output from the input tok embed
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
-
- for (int i = 0; i < n_layer; ++i) {
- const int64_t n_head = hparams.n_head(i);
- const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
- const int64_t n_ff = hparams.n_ff(i);
-
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_GPTNEOX:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_ARCTIC:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
-                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- }
- } break;
- case LLM_ARCH_DEEPSEEK:
- {
-
- const int64_t n_ff_exp = hparams.n_ff_exp;
- const int64_t n_expert_shared = hparams.n_expert_shared;
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- if (i < (int) hparams.n_layer_dense_lead) {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- } else {
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
-
- // MoE branch
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
- // Shared expert branch
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- }
- }
- } break;
- case LLM_ARCH_DEEPSEEK2:
- {
- const bool is_lite = (hparams.n_layer == 27);
-
- const int64_t n_embd_head_qk_rope = hparams.n_rot;
- const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-
- const int64_t q_lora_rank = hparams.n_lora_q;
- const int64_t kv_lora_rank = hparams.n_lora_kv;
-
- const int64_t n_ff_exp = hparams.n_ff_exp;
- const int64_t n_expert_shared = hparams.n_expert_shared;
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- if (!is_lite) {
- layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
- }
-
- layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
-
- if (!is_lite) {
- layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
- layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
- } else {
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- }
-
- layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- if (i < (int) hparams.n_layer_dense_lead) {
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- } else {
- layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
- layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- if (n_expert == 0) {
- throw std::runtime_error("n_expert must be > 0");
- }
- if (n_expert_used == 0) {
- throw std::runtime_error("n_expert_used must be > 0");
- }
-
- // MoE branch
- layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
- layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
- layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
-
- // Shared expert branch
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
- }
- }
- } break;
- case LLM_ARCH_BITNET:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_T5:
- {
- const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
-
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
- layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
- layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
- // this tensor seems to be unused in HF transformers implementation
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_T5ENCODER:
- {
- const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
-
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
-
- layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_JAIS:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
- }
- } break;
- case LLM_ARCH_CHATGLM:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
- layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
-
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
- }
- } break;
- case LLM_ARCH_NEMOTRON:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- // optional bias tensors
- layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
-
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
-
- // optional MLP bias
- layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
- }
- } break;
- case LLM_ARCH_EXAONE:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
- layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_RWKV6:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // Block 0, LN0
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
-
- const int time_mix_extra_dim = hparams.time_mix_extra_dim;
- const int time_decay_extra_dim = hparams.time_decay_extra_dim;
- const int head_size = hparams.wkv_head_size;
- const int attn_hidden_size = n_embd;
- const int ffn_size = hparams.n_ff_arr[0];
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
-
- layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
- layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
-
- layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
- layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
-
- layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
- layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0);
-
- layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
- layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
- layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
- layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
- layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
- layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
-
- layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
- layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
- layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
-
- layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
- layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
-
- layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
- layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
- layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
- }
-
- } break;
- case LLM_ARCH_CHAMELEON:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
- // if output is NULL, init from the input tok embed
- if (model.output == NULL) {
- model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
- }
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
- layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
- layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
-
- layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
- layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
- layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-
- layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-
- layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
- layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
- layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
- }
- } break;
- case LLM_ARCH_WAVTOKENIZER_DEC:
- {
- model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
-
- model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
- model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
-
- // posnet
- {
- const int64_t n_embd = hparams.posnet.n_embd;
-
- for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
- auto & layer = model.layers[i].posnet;
-
- // posnet:
- //
- // - resnet
- // - resnet
- // - attn
- // - resnet
- // - resnet
- // - norm
- //
- switch (i) {
- case 0:
- case 1:
- case 3:
- case 4:
- {
- layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
- layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
-
- layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
- layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
-
- layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
- layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
-
- layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
- layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
- } break;
- case 2:
- {
- layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
- layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
-
- layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
-
- layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
-
- layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
-
- layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
- layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
- } break;
- case 5:
- {
- layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
- layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
- } break;
- default: GGML_ABORT("unknown posnet layer");
- };
- }
- }
-
- GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
-
- model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
- model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
-
- // convnext
- {
- const int64_t n_embd = hparams.convnext.n_embd;
-
- for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
- auto & layer = model.layers[i].convnext;
-
- layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
- layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
-
- layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
- layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
-
- layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
- layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
-
- layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
- layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
-
- layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
- }
-
- // output
- model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
- model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
- }
-
- model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
- model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
- } break;
- default:
- throw std::runtime_error("unknown architecture");
- }
-
- if (n_moved_tensors > 0) {
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
- }
- }
-
- ml.done_getting_tensors();
-
- ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
- model.mappings.reserve(ml.mappings.size());
-
- // create the backend buffers
- std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
- ctx_bufs.reserve(ctx_map.size());
-
- // Ensure we have enough capacity for the maximum backend buffer we will potentially create
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
- model.bufs.reserve(n_max_backend_buffer);
-
- for (auto & it : ctx_map) {
- ggml_backend_buffer_type_t buft = it.first;
- ggml_context * ctx = it.second;
-
- // skip contexts without tensors
- if (ggml_get_first_tensor(ctx) == nullptr) {
- continue;
- }
-
- llama_buf_map bufs;
- bufs.reserve(n_max_backend_buffer);
-
- // check if it is possible to use buffer_from_host_ptr with this buffer type
- ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
- if (!dev) {
- // FIXME: workaround for CPU backend buft having a NULL device
- dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- }
- ggml_backend_dev_props props;
- ggml_backend_dev_get_props(dev, &props);
- bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
- bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
-
- if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
- // only the mmap region containing the tensors in the model is mapped to the backend buffer
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
- // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
- void * addr = nullptr;
- size_t first, last; // NOLINT
- ml.get_mapping_range(&first, &last, &addr, idx, ctx);
- if (first >= last) {
- continue;
- }
- const size_t max_size = ggml_get_max_tensor_size(ctx);
- ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
- if (buf == nullptr) {
- throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
- }
- model.bufs.emplace_back(buf);
- bufs.emplace(idx, buf);
- }
- }
- else {
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
- if (buf == nullptr) {
- throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
- }
- model.bufs.emplace_back(buf);
- if (use_mlock && ggml_backend_buffer_is_host(buf)) {
- model.mlock_bufs.emplace_back(new llama_mlock);
- auto & mlock_buf = model.mlock_bufs.back();
- mlock_buf->init (ggml_backend_buffer_get_base(buf));
- mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
- }
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
- bufs.emplace(idx, buf);
- }
- }
-
- if (bufs.empty()) {
- throw std::runtime_error("failed to allocate buffer");
- }
-
- for (auto & buf : bufs) {
- // indicate that this buffer contains weights
- // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
- ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
- }
-
- ctx_bufs.emplace_back(ctx, bufs);
- }
-
- if (llama_supports_gpu_offload()) {
- const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
- if (n_gpu_layers > (int) hparams.n_layer) {
- LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
- }
-
- const int max_backend_supported_layers = hparams.n_layer + 1;
- const int max_offloadable_layers = hparams.n_layer + 1;
-
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
- }
-
- // print memory requirements per buffer type
- for (auto & buf : model.bufs) {
- LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
- }
-
- // populate tensors_by_name
- for (auto & ctx : model.ctxs) {
- for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
- }
- }
-
- // load tensor data
- for (auto & it : ctx_bufs) {
- ggml_context * ctx = it.first;
- auto & bufs = it.second;
- if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
- return false;
- }
- }
-
- if (use_mmap_buffer) {
- for (auto & mapping : ml.mappings) {
- model.mappings.emplace_back(std::move(mapping));
- }
- }
-
- return true;
-}
-
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
- model.t_start_us = ggml_time_us();
+ // loading time will be recalculated after the first eval, so
+ // we take page faults deferred by mmap() into consideration
+ model.t_load_us = 0;
+ time_meas tm(model.t_load_us);
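+ // scoped timer (assumption: time_meas accumulates the elapsed time into model.t_load_us when it goes out of scope)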
+
+ model.t_start_us = tm.t_start_us;
try {
llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
+ ml.print_info();
+
model.hparams.vocab_only = params.vocab_only;
try {
- llm_load_arch(ml, model);
+ model.load_arch(ml);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
}
try {
- llm_load_hparams(ml, model);
+ model.load_hparams(ml);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
}
try {
- llm_load_vocab(ml, model);
+ model.load_vocab(ml);
} catch(const std::exception & e) {
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
}
- llm_load_stats(ml, model);
- llm_load_print_meta(ml, model);
-
- if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
- model.hparams.n_vocab != model.vocab.id_to_token.size()) {
- throw std::runtime_error("vocab size mismatch");
- }
+ model.load_stats(ml);
+ model.print_info();
if (params.vocab_only) {
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
return 0;
}
- if (!llm_load_tensors(
- ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
- params.progress_callback, params.progress_callback_user_data
- )) {
+ if (!model.load_tensors(ml)) {
return -2;
}
} catch (const std::exception & err) {
return -1;
}
- // loading time will be recalculate after the first eval, so
- // we take page faults deferred by mmap() into consideration
- model.t_load_us = ggml_time_us() - model.t_start_us;
-
return 0;
}
ggml_set_input(lctx.inp_tokens);
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
+
+ // apply lora for embedding tokens if needed
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd);
+ if (lw == nullptr) {
+ continue;
+ }
+ const float adapter_scale = it.second;
+ const float scale = lw->get_scale(it.first->alpha, adapter_scale);
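+ // gather only the lora_a rows for this batch's tokens, project through lora_b, then scale and add the delta to the embeddings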
+ struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
+ ctx, lw->b, // non-transposed lora_b
+ ggml_get_rows(ctx, lw->a, lctx.inp_tokens)
+ ), scale);
+ inpL = ggml_add(ctx, inpL, inpL_delta);
+ }
} else {
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
inpL = lctx.inp_embd;
struct ggml_tensor * w,
struct ggml_tensor * cur) {
struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
- for (auto & it : lctx.lora_adapters) {
- struct llama_lora_weight * lora = it.first->get_weight(w);
- if (lora == nullptr) {
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+ if (lw == nullptr) {
continue;
}
- const float alpha = it.first->alpha;
- const float rank = (float) lora->b->ne[0];
- const float scale = alpha ? it.second * alpha / rank : it.second;
+ const float adapter_scale = it.second;
+ const float scale = lw->get_scale(it.first->alpha, adapter_scale);
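+ // get_scale() computes adapter_scale * alpha / rank when alpha != 0, otherwise adapter_scale (same as the inline formula it replaces)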
struct ggml_tensor * ab_cur = ggml_mul_mat(
- ctx0, lora->b,
- ggml_mul_mat(ctx0, lora->a, cur)
+ ctx0, lw->b,
+ ggml_mul_mat(ctx0, lw->a, cur)
);
ab_cur = ggml_scale(ctx0, ab_cur, scale);
res = ggml_add(ctx0, res, ab_cur);
struct ggml_tensor * cur, // struct ggml_tensor * b
struct ggml_tensor * ids) {
struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
- for (auto & it : lctx.lora_adapters) {
- struct llama_lora_weight * lora = it.first->get_weight(w);
- if (lora == nullptr) {
+ for (auto & it : lctx.lora) {
+ struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+ if (lw == nullptr) {
continue;
}
const float alpha = it.first->alpha;
- const float rank = (float) lora->b->ne[0];
+ const float rank = (float) lw->b->ne[0];
const float scale = alpha ? it.second * alpha / rank : it.second;
struct ggml_tensor * ab_cur = ggml_mul_mat_id(
- ctx0, lora->b,
- ggml_mul_mat_id(ctx0, lora->a, cur, ids),
+ ctx0, lw->b,
+ ggml_mul_mat_id(ctx0, lw->a, cur, ids),
ids
);
ab_cur = ggml_scale(ctx0, ab_cur, scale);
const struct llama_layer * layer,
struct ggml_tensor * cur,
struct ggml_tensor * x_prev,
- struct ggml_tensor ** wkv_state) {
+ struct ggml_tensor ** wkv_state,
+ size_t wkv_head_size,
+ size_t head_count_kv) {
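+ // the head size can no longer be read from time_mix_first (absent in QRWKV6-style layers), so it is passed in; head_count_kv enables grouped k/v heads (see the repeat below)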
size_t n_embd = cur->ne[0];
size_t n_seq_tokens = cur->ne[1];
size_t n_seqs = cur->ne[2];
- size_t head_size = layer->time_mix_first->ne[0];
- size_t head_count = layer->time_mix_first->ne[1];
+ size_t head_size = wkv_head_size;
+ size_t head_count = n_embd / head_size;
size_t n_tokens = n_seqs * n_seq_tokens;
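+ // QRWKV6-style layers have no time_mix_first tensor; detect that here to switch behavior below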
+ bool is_qrwkv = layer->time_mix_first == nullptr;
+
struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
xxx
);
- struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
- struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
- struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
- struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
- struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
-
- struct ggml_tensor * xw = ggml_add(
- ctx,
- ggml_mul(
- ctx,
- ggml_add(ctx, mw, layer->time_mix_lerp_w),
- sx
- ),
- cur
- );
+ struct ggml_tensor *xw, *xk, *xv, *xr, *xg;
+ if (layer->time_mix_lerp_fused) {
+ // the token-shift lerp weights for w/k/v/r/g are fused into a single tensor and applied in one pass, which improves performance
+ sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens);
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
+ xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur);
+ xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+ } else {
+ // for backward compatibility
+ xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
- struct ggml_tensor * xk = ggml_add(
- ctx,
- ggml_mul(
- ctx,
- ggml_add(ctx, mk, layer->time_mix_lerp_k),
- sx
- ),
- cur
- );
+ xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur);
+ xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur);
+ xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur);
+ xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur);
+ xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur);
+ }
- struct ggml_tensor * xv = ggml_add(
- ctx,
- ggml_mul(
- ctx,
- ggml_add(ctx, mv, layer->time_mix_lerp_v),
- sx
- ),
- cur
- );
+ struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr);
+ struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk);
+ struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv);
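+ // some checkpoints provide biases for the receptance/key/value projections; add them when present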
+ if (layer->time_mix_receptance_b) {
+ r = ggml_add(ctx, r, layer->time_mix_receptance_b);
+ }
+ if (layer->time_mix_key_b) {
+ k = ggml_add(ctx, k, layer->time_mix_key_b);
+ }
+ if (layer->time_mix_value_b) {
+ v = ggml_add(ctx, v, layer->time_mix_value_b);
+ }
- struct ggml_tensor * xr = ggml_add(
- ctx,
- ggml_mul(
- ctx,
- ggml_add(ctx, mr, layer->time_mix_lerp_r),
- sx
- ),
- cur
- );
+ struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg);
+ if (is_qrwkv) {
+ g = ggml_sigmoid(ctx, g);
+ } else {
+ g = ggml_silu(ctx, g);
+ }
- struct ggml_tensor * xg = ggml_add(
- ctx,
- ggml_mul(
- ctx,
- ggml_add(ctx, mg, layer->time_mix_lerp_g),
- sx
- ),
- cur
- );
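+ // grouped k/v heads: repeat k and v across the query-head groups so they match the full head count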
+ if (head_count_kv != head_count) {
+ GGML_ASSERT(head_count % head_count_kv == 0);
+ k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens);
+ v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens);
+ struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens);
+ k = ggml_repeat(ctx, k, tmp);
+ v = ggml_repeat(ctx, v, tmp);
+ }
- struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
- struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
- struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
- struct ggml_tensor * g = ggml_silu(
- ctx,
- llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
- );
+ k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens);
+ v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens);
+ r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens);
struct ggml_tensor * w = ggml_mul_mat(
ctx,
)
);
- w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
+ w = ggml_add(ctx, w, layer->time_mix_decay);
w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
- w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
+ w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens);
- k = ggml_transpose(ctx, k);
- v = ggml_transpose(ctx, v);
- r = ggml_transpose(ctx, r);
+ if (is_qrwkv) {
+ // k = k * (1 - w)
+ k = ggml_sub(ctx, k, ggml_mul(ctx, k, w));
+ }
- struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
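+ // when time_mix_first is absent (QRWKV6), use gated linear attention with 1/sqrt(head_size) scaling instead of the wkv6 op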
+ struct ggml_tensor * wkv_output;
+ if (!layer->time_mix_first) {
+ wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f));
+ } else {
+ wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+ }
cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
*wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
- // group norm with head_count groups
- cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
- cur = ggml_norm(ctx, cur, 64e-5f);
+ if (!is_qrwkv) {
+ // group norm with head_count groups
+ cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
+ cur = ggml_norm(ctx, cur, 64e-5f);
- // Convert back to regular vectors.
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
- cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+ // Convert back to regular vectors.
+ cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+ cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
+ } else {
+ cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
+ }
cur = ggml_mul(ctx, cur, g);
cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
}
struct ggml_cgraph * build_k_shift() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
GGML_ASSERT(kv_self.size == n_ctx);
}
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
}
struct ggml_cgraph * build_llama() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
// feed-forward network
if (model.layers[il].ffn_gate_inp == nullptr) {
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
}
struct ggml_cgraph * build_deci() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_baichuan() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
- struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
+ struct ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
cb(Vcur, "Vcur", il);
switch (model.type) {
- case MODEL_7B:
+ case LLM_TYPE_7B:
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
break;
- case MODEL_13B:
+ case LLM_TYPE_13B:
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
break;
}
struct ggml_cgraph * build_xverse() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_falcon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_grok() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_dbrx() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_starcoder() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_refact() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_bert() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_bloom() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_mpt() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_qwen() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_qwen2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_qwen2vl() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
}
struct ggml_cgraph * build_qwen2moe() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_phi2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_phi3() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm,
- NULL,
+ model.layers[il].attn_norm_b,
LLM_NORM_RMS, cb, il);
cb(attn_norm_output, "attn_norm", il);
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
- }
- else {
+ } else {
Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
residual = cur;
cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].ffn_norm, NULL,
+ model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
- // FF
- // special-case: the up and gate tensors are merged into a single tensor
- // TOOD: support into llm_build_ffn
- {
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
cur = llm_build_ffn(ctx0, lctx, cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, true,
+ false, 0.0,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
}
cur = ggml_add(ctx0, residual, cur);
cur = llm_build_norm(ctx0, inpL, hparams,
model.output_norm,
- NULL,
+ model.output_norm_b,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
+ if (model.output_b != nullptr) {
+ cb(cur, "result_output_no_bias", -1);
+ cur = ggml_add(ctx0, cur, model.output_b);
+ }
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
}
struct ggml_cgraph * build_gpt2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_codeshell() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_orion() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_internlm2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_minicpm3() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
//TODO: if the model varies, these parameters need to be read from the model
const int64_t n_embd_base = 256;
}
struct ggml_cgraph * build_gemma() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
}
struct ggml_cgraph * build_gemma2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
switch (model.type) {
- case llm_type::MODEL_2B:
- case llm_type::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
- case llm_type::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
+ case LLM_TYPE_2B:
+ case LLM_TYPE_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
+ case LLM_TYPE_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
default: GGML_ABORT("fatal error");
};
cb(Qcur, "Qcur_scaled", il);
struct ggml_cgraph * build_starcoder2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_mamba() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
struct ggml_cgraph * build_command_r() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_cohere2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// * removed bias
// * removed MoE
struct ggml_cgraph * build_olmo() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_olmo2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
// * removed bias
// * added q, k norm
struct ggml_cgraph * build_olmoe() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_openelm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_gptneox() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_arctic() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_deepseek() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_deepseek2() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_bitnet() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_t5_enc() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_t5_dec() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_jais() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_chatglm() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
}
struct ggml_cgraph * build_nemotron() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
}
struct ggml_cgraph * build_exaone() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
ggml_cgraph * build_rwkv6() {
- ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// Token shift state dimensions should be 2 * n_embd
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
1
);
- cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
+ cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size));
ggml_build_forward_expand(gf, cur);
ggml_build_forward_expand(
gf,
return gf;
}
+ // ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
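+ // Note: this builder combines RWKV6 time-mixing (recurrent state stored in the
+ // KV cache, see llm_build_rwkv6_time_mix below) with a Qwen2-style gated SiLU
+ // feed-forward block.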
+ ggml_cgraph * build_rwkv6qwen2() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+
+ GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
+ GGML_ASSERT(n_seqs != 0);
+ GGML_ASSERT(ubatch.equal_seqs);
+ GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+ struct ggml_tensor * state_copy = build_inp_s_copy();
+ struct ggml_tensor * state_mask = build_inp_s_mask();
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ for (int il = 0; il < n_layer; ++il) {
+ const llama_layer * layer = &model.layers[il];
+
+ // (ab)using the KV cache to store the states
+ struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.k_l[il], state_copy, state_mask,
+ hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
+ struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
+ gf, kv_self.v_l[il], state_copy, state_mask,
+ hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
+
+ cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+ token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs);
+
+ struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il);
+ struct ggml_tensor * x_prev = ggml_concat(
+ ctx0,
+ token_shift,
+ ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
+ 1
+ );
+
+ ggml_build_forward_expand(
+ gf,
+ ggml_cpy(
+ ctx0,
+ wkv_states,
+ ggml_view_1d(
+ ctx0,
+ kv_self.v_l[il],
+ hparams.n_embd_v_s() * n_seqs,
+ hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
+ )
+ )
+ );
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv()));
+ ggml_build_forward_expand(gf, ffn_inp);
+ ggml_build_forward_expand(
+ gf,
+ ggml_cpy(
+ ctx0,
+ wkv_states,
+ ggml_view_1d(
+ ctx0,
+ kv_self.v_l[il],
+ hparams.n_embd_v_s() * n_seqs,
+ hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
+ )
+ )
+ );
+
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
// ref: https://github.com/facebookresearch/chameleon
// based on the original build_llama() function, changes:
// * qk-norm
// * removed bias
// * removed MoE
struct ggml_cgraph * build_chameleon() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
}
struct ggml_cgraph * build_wavtokenizer_dec() {
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
- const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+ const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
- const auto & dev_layer = lctx.model.dev_layer.at(il);
+ const auto & dev_layer = lctx.model.dev_layer(il);
for (auto & backend : lctx.backends) {
- if (ggml_backend_get_device(backend.get()) == dev_layer.dev) {
+ if (ggml_backend_get_device(backend.get()) == dev_layer) {
if (ggml_backend_supports_op(backend.get(), cur)) {
ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
}
result = llm.build_phi2();
} break;
case LLM_ARCH_PHI3:
+ case LLM_ARCH_PHIMOE:
{
result = llm.build_phi3();
} break;
{
result = llm.build_rwkv6();
} break;
+ case LLM_ARCH_RWKV6QWEN2:
+ {
+ result = llm.build_rwkv6qwen2();
+ } break;
case LLM_ARCH_CHAMELEON:
{
result = llm.build_chameleon();
const uint32_t n_tokens_all = batch.n_tokens;
const auto & model = lctx.model;
+ const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
if (batch.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
llama_kv_slot_restorer kv_slot_restorer(kv_self);
const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_vocab = vocab.n_tokens();
uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
if (batch.token) {
for (uint32_t i = 0; i < n_tokens; ++i) {
- if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
- //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
+ //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
- const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
+ const uint32_t max_moves = (lctx.model.max_nodes() - 2*n_layer)/(6*n_layer);
// determine which KV cells to move where
//
// build worst-case graph
uint32_t n_seqs = 1; // TODO: worst-case number of sequences
uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
- llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ llama_token token = lctx.model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
}
}
-int32_t llama_lora_adapter_set(
+int32_t llama_set_adapter_lora(
struct llama_context * ctx,
- struct llama_lora_adapter * adapter,
+ struct llama_adapter_lora * adapter,
float scale) {
- ctx->lora_adapters[adapter] = scale;
+ ctx->lora[adapter] = scale;
return 0;
}
-int32_t llama_lora_adapter_remove(
+int32_t llama_rm_adapter_lora(
struct llama_context * ctx,
- struct llama_lora_adapter * adapter) {
- auto pos = ctx->lora_adapters.find(adapter);
- if (pos != ctx->lora_adapters.end()) {
- ctx->lora_adapters.erase(pos);
+ struct llama_adapter_lora * adapter) {
+ auto pos = ctx->lora.find(adapter);
+ if (pos != ctx->lora.end()) {
+ ctx->lora.erase(pos);
return 0;
}
return -1;
}
-void llama_lora_adapter_clear(struct llama_context * ctx) {
- ctx->lora_adapters.clear();
+void llama_clear_adapter_lora(struct llama_context * ctx) {
+ ctx->lora.clear();
}
-// TODO: tmp
-int32_t llama_control_vector_apply(
- struct llama_context * lctx,
+int32_t llama_apply_adapter_cvec(
+ struct llama_context * ctx,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end) {
- return llama_control_vector_apply(lctx->cvec, lctx->model, data, len, n_embd, il_start, il_end);
+ return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end);
}
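+// Illustrative usage of the renamed adapter API (sketch only; "adapter.gguf" is
+// a placeholder path):
+//
+//   struct llama_adapter_lora * adapter = llama_adapter_lora_init(model, "adapter.gguf");
+//   llama_set_adapter_lora(ctx, adapter, 1.0f);  // attach to a context with scale 1.0
+//   llama_rm_adapter_lora(ctx, adapter);         // detach from that context
+//   llama_clear_adapter_lora(ctx);               // detach all adapters from the context
+//   llama_adapter_lora_free(adapter);            // free once no longer needed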
//
struct llama_model_params params) {
ggml_time_init();
- llama_model * model = new llama_model;
+ llama_model * model = new llama_model(params);
unsigned cur_percentage = 0;
if (params.progress_callback == NULL) {
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
}
- int status = llama_model_load(path_model, *model, params);
+ const int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
return model;
}
-struct llama_context * llama_new_context_with_model(
+struct llama_context * llama_init_from_model(
struct llama_model * model,
struct llama_context_params params) {
backend_ptrs.push_back(backend.get());
}
- const size_t max_nodes = llama_model_max_nodes(*model);
+ const size_t max_nodes = model->max_nodes();
// buffer used to store the computation graph and the tensor meta data
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
- llama_get_device_count(*model) > 1 &&
- model->n_gpu_layers > (int)model->hparams.n_layer &&
- model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ model->n_devices() > 1 &&
+ model->params.n_gpu_layers > (int)model->hparams.n_layer &&
+ model->params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv;
// pipeline parallelism requires support for async compute and events in all devices
// initialize scheduler with the worst-case graph
uint32_t n_seqs = 1; // TODO: worst-case number of sequences
uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
- llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+ llama_token token = ctx->model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
return ctx;
}
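+// Deprecated compatibility wrapper; new code should call llama_init_from_model() directly.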
+struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params) {
+ return llama_init_from_model(model, params);
+}
+
//
// kv cache
//
return ret;
}
-//
-// vocab
-//
-
-// TODO: tmp bridges below until `struct llama_vocab` is exposed through the public API
-
-const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
- return llama_token_get_text_impl(model->vocab, token);
-}
-
-float llama_token_get_score(const struct llama_model * model, llama_token token) {
- return llama_token_get_score_impl(model->vocab, token);
-}
-
-enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
- return llama_token_get_attr_impl(model->vocab, token);
-}
-
-bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
- return llama_token_is_eog_impl(model->vocab, token);
-}
-
-bool llama_token_is_control(const struct llama_model * model, llama_token token) {
- return llama_token_is_control_impl(model->vocab, token);
-}
-
-llama_token llama_token_bos(const struct llama_model * model) {
- return llama_token_bos_impl(model->vocab);
-}
-
-llama_token llama_token_eos(const struct llama_model * model) {
- return llama_token_eos_impl(model->vocab);
-}
-
-llama_token llama_token_eot(const struct llama_model * model) {
- return llama_token_eot_impl(model->vocab);
-}
-
-llama_token llama_token_cls(const struct llama_model * model) {
- return llama_token_cls_impl(model->vocab);
-}
-
-llama_token llama_token_sep(const struct llama_model * model) {
- return llama_token_sep_impl(model->vocab);
-}
-
-llama_token llama_token_nl (const struct llama_model * model) {
- return llama_token_nl_impl(model->vocab);
-}
-
-llama_token llama_token_pad(const struct llama_model * model) {
- return llama_token_pad_impl(model->vocab);
-}
-
-bool llama_add_bos_token(const struct llama_model * model) {
- return llama_add_bos_token_impl(model->vocab);
-}
-
-bool llama_add_eos_token(const struct llama_model * model) {
- return llama_add_eos_token_impl(model->vocab);
-}
-
-llama_token llama_token_prefix(const struct llama_model * model) {
- return llama_token_prefix_impl(model->vocab);
-}
-
-llama_token llama_token_middle(const struct llama_model * model) {
- return llama_token_middle_impl(model->vocab);
-}
-
-llama_token llama_token_suffix(const struct llama_model * model) {
- return llama_token_suffix_impl(model->vocab);
-}
-
-llama_token llama_token_fim_pre(const struct llama_model * model) {
- return llama_token_fim_pre_impl(model->vocab);
-}
-
-llama_token llama_token_fim_suf(const struct llama_model * model) {
- return llama_token_fim_suf_impl(model->vocab);
-}
-
-llama_token llama_token_fim_mid(const struct llama_model * model) {
- return llama_token_fim_mid_impl(model->vocab);
-}
-
-llama_token llama_token_fim_pad(const struct llama_model * model) {
- return llama_token_fim_pad_impl(model->vocab);
-}
-
-llama_token llama_token_fim_rep(const struct llama_model * model) {
- return llama_token_fim_rep_impl(model->vocab);
-}
-
-llama_token llama_token_fim_sep(const struct llama_model * model) {
- return llama_token_fim_sep_impl(model->vocab);
-}
-
-//
-// tokenization
-//
-
-int32_t llama_tokenize(
- const struct llama_model * model,
- const char * text,
- int32_t text_len,
- llama_token * tokens,
- int32_t n_tokens_max,
- bool add_special,
- bool parse_special) {
- return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
-}
-
-int32_t llama_token_to_piece(
- const struct llama_model * model,
- llama_token token,
- char * buf,
- int32_t length,
- int32_t lstrip,
- bool special) {
- return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
-}
-
-int32_t llama_detokenize(
- const struct llama_model * model,
- const llama_token * tokens,
- int32_t n_tokens,
- char * text,
- int32_t text_len_max,
- bool remove_special,
- bool unparse_special) {
- return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
-}
-
//
// chat templates
//
int32_t llama_chat_apply_template(
- const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length) {
- std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
- if (tmpl == nullptr) {
- GGML_ASSERT(model != nullptr);
-
- // load template from model, if available
- const auto & it = model->gguf_kv.find("tokenizer.chat_template");
- if (it != model->gguf_kv.end() && it->second.size() > 0) {
- curr_tmpl = it->second;
- }
- else {
- // worst case: there is no information about template, we will use chatml by default
- curr_tmpl = "chatml"; // see llm_chat_apply_template
- }
- }
+ const std::string curr_tmpl(tmpl == nullptr ? "chatml" : tmpl);
// format the chat to string
std::vector<const llama_chat_message *> chat_vec;
return res;
}
-//
-// sampling
-//
-
-// TODO: remove indirection when vocab becomes accesible in llama-sampling.cpp
-struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
- return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
-}
-
-struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
- return llama_sampler_init_infill_impl(model->vocab);
-}
-
-struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
- return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
-}
-
//
// model split
//
return 0;
}
-int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
+int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
std::string str_split_path(split_path);
char postfix[32];
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
std::string str_postfix(postfix);
- // check if dest ends with postfix
+ // check if split_prefix ends with postfix
int size_prefix = str_split_path.size() - str_postfix.size();
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
- snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
+ snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
return size_prefix;
}
// TODO: show sample usage
//
- // struct llama_vocab; // TODO: add in the future
+ struct llama_vocab;
struct llama_model;
struct llama_context;
struct llama_sampler;
} llama_chat_message;
// lora adapter
- // TODO: rename to llama_adapter_lora
- struct llama_lora_adapter;
+ struct llama_adapter_lora;
// Helpers for getting default parameters
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
// Call once at the start of the program
LLAMA_API void llama_backend_init(void);
+ // Call once at the end of the program - currently only used for MPI
+ LLAMA_API void llama_backend_free(void);
+
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
- struct llama_context * ctx,
- ggml_threadpool_t threadpool,
- ggml_threadpool_t threadpool_batch);
- LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+ struct llama_context * ctx,
+ ggml_threadpool_t threadpool,
+ ggml_threadpool_t threadpool_batch);
- // Call once at the end of the program - currently only used for MPI
- LLAMA_API void llama_backend_free(void);
+ LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
LLAMA_API void llama_model_free(struct llama_model * model);
- // TODO: rename to llama_init_from_model
- LLAMA_API struct llama_context * llama_new_context_with_model(
+ LLAMA_API struct llama_context * llama_init_from_model(
struct llama_model * model,
struct llama_context_params params);
+ DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
+ struct llama_model * model,
+ struct llama_context_params params),
+ "use llama_init_from_model instead");
+
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
- LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
- LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
- LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
- LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
- LLAMA_API int32_t llama_n_head (const struct llama_model * model);
+ DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
+ DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead");
+ DEPRECATED(LLAMA_API int32_t llama_n_layer (const struct llama_model * model), "use llama_model_n_layer instead");
+ DEPRECATED(LLAMA_API int32_t llama_n_head (const struct llama_model * model), "use llama_model_n_head instead");
+
+ DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
+
+ LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
+ LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
- LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+ LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
+ LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
- LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
- LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
- LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
+ LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
+ LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model);
+ LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
+ LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor
- LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
+ LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
+
+ LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
+
+ LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
// Functions to access the model's GGUF metadata scalar values
// - The functions return the length of the string on success, or -1 on failure
// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+ // Get the default chat template. Returns nullptr if not available
+ LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
+
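+ // Usage sketch (illustrative): with the model parameter removed from
+ // llama_chat_apply_template, callers fetch the stored template explicitly:
+ //   const char * tmpl = llama_model_chat_template(model);
+ //   int32_t res  = llama_chat_apply_template(tmpl, chat, n_msg, true, buf, length);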
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
//
// Load a LoRA adapter from file
- // TODO: rename to llama_adapter_lora_init
- LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+ LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
+ // Manually free a LoRA adapter
+ // Note: loaded adapters will be freed when the associated model is deleted
+ LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+
+ // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
// Add a loaded LoRA adapter to given context
// This will not modify model's weight
- // TODO: rename to llama_set_adapter_lora
- LLAMA_API int32_t llama_lora_adapter_set(
+ LLAMA_API int32_t llama_set_adapter_lora(
struct llama_context * ctx,
- struct llama_lora_adapter * adapter,
+ struct llama_adapter_lora * adapter,
float scale);
// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
- // TODO: rename to llama_rm_adapter_lora
- LLAMA_API int32_t llama_lora_adapter_remove(
+ LLAMA_API int32_t llama_rm_adapter_lora(
struct llama_context * ctx,
- struct llama_lora_adapter * adapter);
+ struct llama_adapter_lora * adapter);
// Remove all LoRA adapters from given context
- // TODO: rename to llama_clear_adapter_lora
- LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
-
- // Manually free a LoRA adapter
- // Note: loaded adapters will be free when the associated model is deleted
- // TODO: rename to llama_adapter_lora_free
- LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
+ LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.
// to an n_embd x n_layers buffer starting from layer 1.
// il_start and il_end are the layer range the vector should apply to (both inclusive)
// See llama_control_vector_load in common to load a control vector.
- // TODO: rename to llama_adapter_cvec_apply
- LLAMA_API int32_t llama_control_vector_apply(
- struct llama_context * lctx,
+ LLAMA_API int32_t llama_apply_adapter_cvec(
+ struct llama_context * ctx,
const float * data,
size_t len,
int32_t n_embd,
// Vocab
//
- LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
+ LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);
- LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
+ LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);
- LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
+ LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);
// Check if the token is supposed to end generation (end-of-generation, e.g. EOS, EOT, etc.)
- LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+ LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);
// Identify if the token id is a control token or a renderable token
- LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+ LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);
// Special tokens
- LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
- LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
- LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
- LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
- LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
- LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
- LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
-
- LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
- LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
-
- // infill tokens
- DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
- DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
- DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
-
- LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
- LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
- LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
- LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
- LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
- LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
+ LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
+ LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
+ LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
+ LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
+ LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
+ LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+
+ LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
+ LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+
+ LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
+ LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
+ LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
+ LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
+ LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
+ LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
+
+ DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
+ DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
+ DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
+ DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
+ DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
+ DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
+ DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
+ DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
+
+ // CLS is equivalent to BOS
+ DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
+ "use llama_vocab_bos instead");
//
// Tokenization
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
llama_token token,
char * buf,
int32_t length,
/// @param remove_special Allows removing BOS and EOS tokens if the model is configured to do so.
/// @param unparse_special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_detokenize(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
/// @param length The size of the allocated buffer
/// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
LLAMA_API int32_t llama_chat_apply_template(
- const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
- // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
//
typedef void * llama_sampler_context_t;
float eta);
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
- const struct llama_model * model,
+ const struct llama_vocab * vocab,
const char * grammar_str,
const char * grammar_root);
float penalty_present); // 0.0 = disabled
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
- LLAMA_API struct llama_sampler * llama_sampler_init_dry(
- const struct llama_model * model,
+ LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+ const struct llama_vocab * vocab,
+ int32_t n_ctx_train,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
// 3. discard non-EOG tokens with low prob
// 4. if no tokens are left -> pick EOT
//
- LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
+ LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
#include <sstream>
static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
- auto * model = llama_get_model(ctx);
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
- n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
+ n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, false);
+ int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_bos, false);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
+ const int n_tokens = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
if (n_tokens < 0) {
result.resize(-n_tokens);
- int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
+ int check = llama_token_to_piece(vocab, token, result.data(), result.size(), 0, false);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
return 1;
}
+ const llama_vocab * vocab_llama = llama_model_get_vocab(model_llama);
+
llama_context_params lcparams = llama_context_default_params();
// tune these to your liking
lcparams.n_threads = params.n_threads;
lcparams.flash_attn = params.flash_attn;
- struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
+ struct llama_context * ctx_llama = llama_init_from_model(model_llama, lcparams);
// print some info about the processing
{
const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
- if (id != llama_token_eos(model_llama)) {
+ if (id != llama_vocab_eos(vocab_llama)) {
// add it to the context
embd.push_back(id);