#include <numeric>
#include <queue>
#include <random>
+#include <regex>
#include <sstream>
#include <thread>
#include <unordered_map>
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
-// tensor names
-#define TN_TOKEN_EMBD "token_embd.weight"
-#define TN_OUTPUT_NORM "output_norm.weight"
-#define TN_OUTPUT "output.weight"
-#define TN_ATTN_NORM "blk.%d.attn_norm.weight"
-#define TN_ATTN_Q "blk.%d.attn_q.weight"
-#define TN_ATTN_K "blk.%d.attn_k.weight"
-#define TN_ATTN_V "blk.%d.attn_v.weight"
-#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
-#define TN_FFN_NORM "blk.%d.ffn_norm.weight"
-#define TN_FFN_GATE "blk.%d.ffn_gate.weight"
-#define TN_FFN_DOWN "blk.%d.ffn_down.weight"
-#define TN_FFN_UP "blk.%d.ffn_up.weight"
-
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
//
// logging
//
+
LLAMA_ATTRIBUTE_FORMAT(2, 3)
static void llama_log_internal (llama_log_level level, const char* format, ...);
static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
// helpers
//
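+// returns the number of bytes in the UTF-8 sequence that starts with `src`,
+// determined from the high nibble of the first byte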
+static size_t utf8_len(char src) {
+ const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+ uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+ return lookup[highbits];
+}
+
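+// replaces every occurrence of `search` in `s` with `replace`, scanning left to right
+// and skipping past each inserted `replace` so that replacements are not re-matched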
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ for (size_t pos = 0; ; pos += replace.length()) {
+ pos = s.find(search, pos);
+ if (pos == std::string::npos) break;
+ s.erase(pos, search.length());
+ s.insert(pos, replace);
+ }
+}
+
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
return std::string(buf.data(), size);
}
+//
+// gguf constants (sync with gguf.py)
+//
+
+enum llm_arch {
+ LLM_ARCH_LLAMA,
+ LLM_ARCH_FALCON,
+ LLM_ARCH_GPT2,
+ LLM_ARCH_GPTJ,
+ LLM_ARCH_GPTNEOX,
+ LLM_ARCH_MPT,
+ LLM_ARCH_UNKNOWN,
+};
+
+static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
+ { LLM_ARCH_LLAMA, "llama" },
+ { LLM_ARCH_FALCON, "falcon" },
+ { LLM_ARCH_GPT2, "gpt2" },
+ { LLM_ARCH_GPTJ, "gptj" },
+ { LLM_ARCH_GPTNEOX, "gptneox" },
+ { LLM_ARCH_MPT, "mpt" },
+};
+
+enum llm_kv {
+ LLM_KV_GENERAL_ARCHITECTURE,
+ LLM_KV_GENERAL_QUANTIZATION_VERSION,
+ LLM_KV_GENERAL_ALIGNMENT,
+ LLM_KV_GENERAL_NAME,
+ LLM_KV_GENERAL_AUTHOR,
+ LLM_KV_GENERAL_URL,
+ LLM_KV_GENERAL_DESCRIPTION,
+ LLM_KV_GENERAL_LICENSE,
+ LLM_KV_GENERAL_SOURCE_URL,
+ LLM_KV_GENERAL_SOURCE_HF_REPO,
+
+ LLM_KV_CONTEXT_LENGTH,
+ LLM_KV_EMBEDDING_LENGTH,
+ LLM_KV_BLOCK_COUNT,
+ LLM_KV_FEED_FORWARD_LENGTH,
+ LLM_KV_USE_PARALLEL_RESIDUAL,
+ LLM_KV_TENSOR_DATA_LAYOUT,
+
+ LLM_KV_ATTENTION_HEAD_COUNT,
+ LLM_KV_ATTENTION_HEAD_COUNT_KV,
+ LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
+ LLM_KV_ATTENTION_CLAMP_KQV,
+ LLM_KV_ATTENTION_LAYERNORM_EPS,
+ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+
+ LLM_KV_ROPE_DIMENSION_COUNT,
+ LLM_KV_ROPE_SCALE_LINEAR,
+
+ LLM_KV_TOKENIZER_MODEL,
+ LLM_KV_TOKENIZER_LIST,
+ LLM_KV_TOKENIZER_TOKEN_TYPE,
+ LLM_KV_TOKENIZER_SCORES,
+ LLM_KV_TOKENIZER_MERGES,
+ LLM_KV_TOKENIZER_BOS_ID,
+ LLM_KV_TOKENIZER_EOS_ID,
+ LLM_KV_TOKENIZER_UNK_ID,
+ LLM_KV_TOKENIZER_SEP_ID,
+ LLM_KV_TOKENIZER_PAD_ID,
+ LLM_KV_TOKENIZER_HF_JSON,
+ LLM_KV_TOKENIZER_RWKV,
+};
+
+static std::map<llm_kv, std::string> LLM_KV_NAMES = {
+ { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
+ { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
+ { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+ { LLM_KV_GENERAL_NAME, "general.name" },
+ { LLM_KV_GENERAL_AUTHOR, "general.author" },
+ { LLM_KV_GENERAL_URL, "general.url" },
+ { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
+ { LLM_KV_GENERAL_LICENSE, "general.license" },
+ { LLM_KV_GENERAL_SOURCE_URL, "general.source_url" },
+ { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source_hf_repo" },
+
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+
+ { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+
+ { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
+ { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
+ { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
+ { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
+ { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
+ { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
+ { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
+ { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+};
+
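+// helper to construct arch-specific gguf key names
+// e.g. LLM_KV(LLM_ARCH_LLAMA)(LLM_KV_CONTEXT_LENGTH) -> "llama.context_length"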
+struct LLM_KV {
+ LLM_KV(llm_arch arch) : arch(arch) {}
+
+ llm_arch arch;
+
+ std::string operator()(llm_kv kv) const {
+ return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
+ }
+};
+
+enum llm_tensor {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_POS_EMBD,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ROPE_FREQS,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_NORM_2,
+ LLM_TENSOR_ATTN_ROT_EMBD,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_NORM,
+};
+
+static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
+ {
+ LLM_ARCH_LLAMA,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+ {
+ LLM_ARCH_FALCON,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
+};
+
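+// map an architecture name from the gguf metadata back to its llm_arch enum value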
+static llm_arch llm_arch_from_string(const std::string & name) {
+ for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
+ if (kv.second == name) {
+ return kv.first;
+ }
+ }
+
+ return LLM_ARCH_UNKNOWN;
+}
+
+// helper to handle gguf constants
+// usage:
+//
+// const auto tn = LLM_TN(LLM_ARCH_LLAMA);
+//
+// std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
+// std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
+// std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
+//
+struct LLM_TN {
+ LLM_TN(llm_arch arch) : arch(arch) {}
+
+ llm_arch arch;
+
+ std::string operator()(llm_tensor tensor) const {
+ return LLM_TENSOR_NAMES[arch].at(tensor);
+ }
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix) const {
+ return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix;
+ }
+
+ std::string operator()(llm_tensor tensor, int bid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid);
+ }
+
+ std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
+ return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix;
+ }
+};
+
+//
+// gguf helpers
+//
+
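+// reads the value of `key` from the gguf context into `dst` using the getter `func`,
+// after checking that the stored type matches `type`; throws if the type is wrong
+// or if a required (req == true) key is missing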
+#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
+{ \
+ const std::string skey(key); \
+ const int kid = gguf_find_key(ctx, skey.c_str()); \
+ if (kid >= 0) { \
+ enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
+ if (ktype != (type)) { \
+ throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
+ } \
+ (dst) = func(ctx, kid); \
+ } else if (req) { \
+ throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
+ } \
+}
+
//
// ggml helpers
//
MODEL_7B,
MODEL_13B,
MODEL_30B,
+ MODEL_40B,
MODEL_65B,
MODEL_70B,
};
static const size_t kB = 1024;
-static const size_t MB = 1024*1024;
+static const size_t MB = kB*kB;
// default hparams (LLaMA 7B)
struct llama_hparams {
uint32_t n_rot = 64;
uint32_t n_ff = 11008;
+ float f_norm_eps = 1e-5;
float f_norm_rms_eps = 1e-5;
float rope_freq_base = 10000.0f;
struct llama_layer {
// normalization
- struct ggml_tensor * attention_norm;
+ struct ggml_tensor * attn_norm;
+ struct ggml_tensor * attn_norm_b;
+ struct ggml_tensor * attn_norm_2;
+ struct ggml_tensor * attn_norm_2_b;
// attention
struct ggml_tensor * wq;
struct ggml_tensor * wk;
struct ggml_tensor * wv;
struct ggml_tensor * wo;
+ struct ggml_tensor * wqkv;
// normalization
struct ggml_tensor * ffn_norm;
// ff
- struct ggml_tensor * w1;
- struct ggml_tensor * w2;
- struct ggml_tensor * w3;
+ struct ggml_tensor * w1; // ffn_gate
+ struct ggml_tensor * w2; // ffn_down
+ struct ggml_tensor * w3; // ffn_up
};
struct llama_kv_cache {
};
struct llama_vocab {
- // TODO:
- // - add a vector of merges
- // so that we can pass it to different types of tokenizers with a common interface
-
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
ttype type;
};
- llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
+ std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_pad_id = -1;
id linefeed_id = 13;
+
+ int find_bpe_rank(std::string token_left, std::string token_right) const {
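+        // the merges use GPT-2 style byte-level BPE symbols:
+        // a space is stored as "Ġ" (U+0120) and a newline as "Ċ" (U+010A)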
+        replace_all(token_left, " ", "Ġ");
+        replace_all(token_left, "\n", "Ċ");
+        replace_all(token_right, " ", "Ġ");
+        replace_all(token_right, "\n", "Ċ");
+
+ auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
+ if (it == bpe_ranks.end()) {
+ return -1;
+ }
+
+ return it->second;
+ }
};
struct llama_model {
e_model type = MODEL_UNKNOWN;
+ llm_arch arch = LLM_ARCH_UNKNOWN;
llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
+ std::string name = "n/a";
+
llama_hparams hparams;
llama_vocab vocab;
struct ggml_tensor * tok_embeddings;
- struct ggml_tensor * norm;
+ struct ggml_tensor * output_norm;
+ struct ggml_tensor * output_norm_b;
struct ggml_tensor * output;
std::vector<llama_layer> layers;
+
int n_gpu_layers;
// context
// key + value cache for the self attention
struct llama_kv_cache kv_self;
- size_t mem_per_token = 0;
-
// decode output (2-dimensional array: [n_tokens][n_vocab])
std::vector<float> logits;
bool logits_all = false;
// model loading and saving
//
-enum llama_file_version {
+enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
};
-static const char * llama_file_version_name(llama_file_version version) {
+static const char * llama_file_version_name(llama_fver version) {
switch (version) {
case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
}
return "unknown";
}
-static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
+static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
char buf[256];
- snprintf(buf, sizeof(buf), "%5u", ne.at(0));
+ snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
for (size_t i = 1; i < ne.size(); i++) {
- snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5u", ne.at(i));
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
}
return buf;
}
bool use_mmap = false;
- llama_file file;
+ llama_file file;
llama_ftype ftype;
- llama_file_version fver;
+ llama_fver fver;
std::unique_ptr<llama_mmap> mapping;
n_kv = gguf_get_n_kv(ctx_gguf);
n_tensors = gguf_get_n_tensors(ctx_gguf);
- fver = (enum llama_file_version) gguf_get_version(ctx_gguf);
+ fver = (enum llama_fver ) gguf_get_version(ctx_gguf);
for (int i = 0; i < n_tensors; i++) {
const char * name = gguf_get_tensor_name(ctx_gguf, i);
}
}
+ std::string get_arch_name() const {
+ const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+ std::string arch_name;
+ GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+
+ return arch_name;
+ }
+
+ enum llm_arch get_arch() const {
+ const std::string arch_name = get_arch_name();
+
+ return llm_arch_from_string(arch_name);
+ }
+
const char * get_tensor_name(int i) const {
return gguf_get_tensor_name(ctx_gguf, i);
}
return tensor;
}
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend backend) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
if (cur == NULL) {
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_30B: return "30B";
+ case MODEL_40B: return "40B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
- default: GGML_ASSERT(false);
+ default: return "?B";
}
}
-static void llama_model_load_internal(
- const std::string & fname,
+static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
+ model.arch = ml.get_arch();
+ if (model.arch == LLM_ARCH_UNKNOWN) {
+ throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
+ }
+}
+
+static void llm_load_hparams(
+ llama_model_loader & ml,
llama_model & model,
- llama_vocab & vocab,
int n_ctx,
- int n_batch,
- int n_gpu_layers,
- int main_gpu,
- const float * tensor_split,
- const bool mul_mat_q,
float rope_freq_base,
- float rope_freq_scale,
- bool low_vram,
- ggml_type memory_type,
- bool use_mmap,
- bool use_mlock,
- bool vocab_only,
- llama_progress_callback progress_callback,
- void * progress_callback_user_data) {
- model.t_start_us = ggml_time_us();
+ float rope_freq_scale) {
+ struct gguf_context * ctx = ml.ctx_gguf;
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
-
- model.n_gpu_layers = n_gpu_layers;
+ const auto kv = LLM_KV(model.arch);
auto & hparams = model.hparams;
- std::string general_name = "n/a";
- std::string general_arch = "n/a";
+ // get general kv
+ GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
- // read hparams
+ // get hparams kv
+ GGUF_GET_KEY(ctx, hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, kv(LLM_KV_TOKENIZER_LIST));
+ GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
+ GGUF_GET_KEY(ctx, hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
+ GGUF_GET_KEY(ctx, hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
+ GGUF_GET_KEY(ctx, hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
+ GGUF_GET_KEY(ctx, hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+
+ // n_head_kv is optional, default to n_head
+ hparams.n_head_kv = hparams.n_head;
+ GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+
+ // TODO: manually setting rope scale should override this
+ // rope_freq_scale (inverse of the kv) is optional
{
- struct gguf_context * ctx = ml->ctx_gguf;
-
-#define GGUF_GET(dst, func, type, req, key) \
- { \
- const int kid = gguf_find_key(ctx, key); \
- if (kid >= 0) { \
- enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
- if (ktype != (type)) { \
- throw std::runtime_error(format("key %s has wrong type: %s", key, gguf_type_name(ktype))); \
- } \
- (dst) = func(ctx, kid); \
- } else if (req) { \
- throw std::runtime_error(format("key not found in model: %s", key)); \
- } \
+ float ropescale = 1.0f;
+ GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+ if (ropescale != 1.0f) {
+ rope_freq_scale = 1.0f/ropescale;
}
+ }
- std::string tokenizer_name;
- GGUF_GET(tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, "tokenizer.ggml.model");
+ // sanity check for n_rot (optional)
+ {
+ hparams.n_rot = hparams.n_embd / hparams.n_head;
- if (tokenizer_name == "llama") {
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
- } else if (tokenizer_name == "gpt2") {
- vocab.type = LLAMA_VOCAB_TYPE_BPE;
- } else {
- LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
- LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
- vocab.type = LLAMA_VOCAB_TYPE_SPM;
+ GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
+
+ if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
}
+ }
- // get hparams kv
- GGUF_GET(hparams.n_vocab, gguf_get_arr_n, GGUF_TYPE_ARRAY, true, "tokenizer.ggml.tokens");
- GGUF_GET(hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.context_length");
- GGUF_GET(hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.embedding_length");
- GGUF_GET(hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.feed_forward_length");
- GGUF_GET(hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.attention.head_count");
- GGUF_GET(hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.block_count");
- GGUF_GET(hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "llama.rope.dimension_count");
- GGUF_GET(hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, "llama.attention.layer_norm_rms_epsilon");
+ // arch-specific KVs
+ switch (model.arch) {
+ case LLM_ARCH_LLAMA:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+
+ switch (hparams.n_layer) {
+ case 26: model.type = e_model::MODEL_3B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ case 60: model.type = e_model::MODEL_30B; break;
+ case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
- // n_head_kv is optional, default to n_head
- hparams.n_head_kv = hparams.n_head;
- GGUF_GET(hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "llama.attention.head_count_kv");
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 60: model.type = e_model::MODEL_40B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
+ default: (void)0;
+ };
- // TODO: manually setting rope scale should override this
- // rope_freq_scale (inverse of the kv) is optional
- float ropescale = 1.0f;
- GGUF_GET(ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, "llama.rope.scale_linear");
- if (ropescale != 1.0f) {
- rope_freq_scale = 1.0f/ropescale;
- }
+ model.ftype = ml.ftype;
- // get general kv
- GGUF_GET(general_name, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.name");
- GGUF_GET(general_arch, gguf_get_val_str, GGUF_TYPE_STRING, false, "general.architecture");
+ hparams.n_ctx = n_ctx;
+ hparams.rope_freq_base = rope_freq_base;
+ hparams.rope_freq_scale = rope_freq_scale;
+}
- // special tokens
- GGUF_GET(vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.bos_token_id");
- GGUF_GET(vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.eos_token_id");
- GGUF_GET(vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.unknown_token_id");
- GGUF_GET(vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.separator_token_id");
- GGUF_GET(vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, "tokenizer.ggml.padding_token_id");
+// TODO: This should probably be in llama.h
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
-#undef GGUF_GET
+static void llm_load_vocab(
+ llama_model_loader & ml,
+ llama_model & model) {
+ auto & vocab = model.vocab;
- switch (hparams.n_layer) {
- case 26: model.type = e_model::MODEL_3B; break;
- case 32: model.type = e_model::MODEL_7B; break;
- case 40: model.type = e_model::MODEL_13B; break;
- case 60: model.type = e_model::MODEL_30B; break;
- case 80: model.type = e_model::MODEL_65B; break;
- default:
- {
- if (hparams.n_layer < 32) {
- model.type = e_model::MODEL_7B;
- }
- } break;
- }
+ struct gguf_context * ctx = ml.ctx_gguf;
- model.ftype = ml->ftype;
+ const auto kv = LLM_KV(model.arch);
- hparams.n_ctx = n_ctx;
+ const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+ if (token_idx == -1) {
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+ }
- // LLaMAv2
- // TODO: probably not needed
- {
- const auto n_gqa = hparams.n_gqa();
+ const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+ if (score_idx == -1) {
+ throw std::runtime_error("cannot find tokenizer scores in model file\n");
+ }
- if (model.type == e_model::MODEL_65B && n_gqa == 8) {
- LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
- model.type = e_model::MODEL_70B;
- }
- }
+ const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
- hparams.rope_freq_base = rope_freq_base;
- hparams.rope_freq_scale = rope_freq_scale;
+ const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+ if (toktype_idx == -1) {
+ throw std::runtime_error("cannot find token type list in GGUF file\n");
}
- // read vocab
+ const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+ // determine vocab type
{
- struct gguf_context * ctx = ml->ctx_gguf;
+ std::string tokenizer_name;
- vocab.id_to_token.resize(hparams.n_vocab);
+ GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
- const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
- if (token_idx == -1) {
- throw std::runtime_error("cannot find tokenizer vocab in model file\n");
- }
+ if (tokenizer_name == "llama") {
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
- const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
- if (score_idx == -1) {
- throw std::runtime_error("cannot find tokenizer scores in model file\n");
- }
+ // default special tokens
+ vocab.special_bos_id = 1;
+ vocab.special_eos_id = 2;
+ vocab.special_unk_id = 0;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ } else if (tokenizer_name == "gpt2") {
+ vocab.type = LLAMA_VOCAB_TYPE_BPE;
- const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+ // read bpe merges and populate bpe ranks
+ const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+ if (merges_keyidx == -1) {
+ throw std::runtime_error("cannot find tokenizer merges in model file\n");
+ }
- const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
- if (toktype_idx == -1) {
- throw std::runtime_error("cannot find token type list in GGUF file\n");
- }
+ const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
- const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+ for (int i = 0; i < n_merges; i++) {
+ const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
- for (uint32_t i = 0; i < hparams.n_vocab; i++) {
- std::string word = gguf_get_arr_str(ctx, token_idx, i);
+ std::string first;
+ std::string second;
- vocab.token_to_id[word] = i;
+ const size_t pos = word.find(' ', 1);
- auto & token_data = vocab.id_to_token[i];
- token_data.text = std::move(word);
- token_data.score = scores[i];
- token_data.type = (llama_token_type) toktypes[i];
+ if (pos != std::string::npos) {
+ first = word.substr(0, pos);
+ second = word.substr(pos + 1);
+ }
- // determine the newline token: 0x0A == 10 == '\n'
- if (token_data.text == "<0x0A>") {
- vocab.linefeed_id = i;
+ vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
}
+
+ // default special tokens
+ vocab.special_bos_id = 11;
+ vocab.special_eos_id = 11;
+ vocab.special_unk_id = -1;
+ vocab.special_sep_id = -1;
+ vocab.special_pad_id = -1;
+ } else {
+ LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+ LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+
+ vocab.type = LLAMA_VOCAB_TYPE_SPM;
}
}
- {
- // hparams
- LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml->fver));
- LLAMA_LOG_INFO("%s: arch = %s\n", __func__, general_arch.c_str());
- LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
- LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
- LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
- LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
- LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
- LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
- LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
- LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
- LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
- LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
- LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
- LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
- LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
- LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
- LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
- LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_elements*1e-9);
-
- // general kv
- LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, general_name.c_str());
-
- // special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
- }
-
- if (vocab_only) {
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
- return;
+ const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+ vocab.id_to_token.resize(n_vocab);
+
+ for (uint32_t i = 0; i < n_vocab; i++) {
+ std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+ vocab.token_to_id[word] = i;
+
+ auto & token_data = vocab.id_to_token[i];
+ token_data.text = std::move(word);
+ token_data.score = scores[i];
+ token_data.type = (llama_token_type) toktypes[i];
}
- auto & ctx = model.ctx;
+ // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
+ vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
+
+ // special tokens
+ GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
+ GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
+ GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
+ GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
+ GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+}
+
+static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
+ const auto & hparams = model.hparams;
+ const auto & vocab = model.vocab;
+
+ // hparams
+ LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
+ LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
+ LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+ LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, hparams.n_ctx);
+ LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+ LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
+ LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
+ LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+ LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+ LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+ LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
+ LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
+ LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
+ LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
+ LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
+ LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+
+ // general kv
+ LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
+
+ // special tokens
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+}
+
+static void llm_load_tensors(
+ llama_model_loader & ml,
+ llama_model & model,
+ int n_batch,
+ int n_gpu_layers,
+ int main_gpu,
+ const float * tensor_split,
+ const bool mul_mat_q,
+ bool low_vram,
+ ggml_type memory_type,
+ bool use_mlock,
+ llama_progress_callback progress_callback,
+ void * progress_callback_user_data) {
+ model.t_start_us = ggml_time_us();
+
+ auto & ctx = model.ctx;
+ auto & hparams = model.hparams;
+
+ model.n_gpu_layers = n_gpu_layers;
size_t ctx_size;
size_t mmapped_size;
- ml->calc_sizes(ctx_size, mmapped_size);
+ ml.calc_sizes(ctx_size, mmapped_size);
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
struct ggml_init_params params = {
/*.mem_size =*/ model.buf.size,
/*.mem_buffer =*/ model.buf.data,
- /*.no_alloc =*/ ml->use_mmap,
+ /*.no_alloc =*/ ml.use_mmap,
};
model.ctx = ggml_init(params);
// prepare memory for the weights
size_t vram_weights = 0;
{
- const uint32_t n_embd = hparams.n_embd;
- const uint32_t n_embd_gqa = hparams.n_embd_gqa();
- const uint32_t n_layer = hparams.n_layer;
- const uint32_t n_vocab = hparams.n_vocab;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_vocab = hparams.n_vocab;
- model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ const auto tn = LLM_TN(model.arch);
- // "output" tensor
- {
- ggml_backend backend_norm;
- ggml_backend backend_output;
- if (n_gpu_layers > int(n_layer)) { // NOLINT
- // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
- // on Windows however this is detrimental unless everything is on the GPU
+ switch (model.arch) {
+ case LLM_ARCH_LLAMA:
+ {
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend backend_norm;
+ ggml_backend backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#else
- backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
- model.norm = ml->create_tensor(ctx, TN_OUTPUT_NORM, {n_embd}, backend_norm);
- model.output = ml->create_tensor(ctx, TN_OUTPUT, {n_embd, n_vocab}, backend_output);
- if (backend_norm == GGML_BACKEND_GPU) {
- vram_weights += ggml_nbytes(model.norm);
- }
- if (backend_output == GGML_BACKEND_GPU_SPLIT) {
- vram_weights += ggml_nbytes(model.output);
- }
- }
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+ if (backend_norm == GGML_BACKEND_GPU) {
+ vram_weights += ggml_nbytes(model.output_norm);
+ }
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+ vram_weights += ggml_nbytes(model.output);
+ }
+ }
- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
- const int i_gpu_start = n_layer - n_gpu_layers;
+ const int i_gpu_start = n_layer - n_gpu_layers;
- model.layers.resize(n_layer);
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ model.layers.resize(n_layer);
- auto & layer = model.layers[i];
- layer.attention_norm = ml->create_tensor(ctx, format(TN_ATTN_NORM, i), {n_embd}, backend);
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
- layer.wq = ml->create_tensor(ctx, format(TN_ATTN_Q, i), {n_embd, n_embd}, backend_split);
- layer.wk = ml->create_tensor(ctx, format(TN_ATTN_K, i), {n_embd, n_embd_gqa}, backend_split);
- layer.wv = ml->create_tensor(ctx, format(TN_ATTN_V, i), {n_embd, n_embd_gqa}, backend_split);
- layer.wo = ml->create_tensor(ctx, format(TN_ATTN_OUTPUT, i), {n_embd, n_embd}, backend_split);
+ auto & layer = model.layers[i];
- layer.ffn_norm = ml->create_tensor(ctx, format(TN_FFN_NORM, i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.w1 = ml->create_tensor(ctx, format(TN_FFN_GATE, i), {n_embd, n_ff}, backend_split);
- layer.w2 = ml->create_tensor(ctx, format(TN_FFN_DOWN, i), { n_ff, n_embd}, backend_split);
- layer.w3 = ml->create_tensor(ctx, format(TN_FFN_UP, i), {n_embd, n_ff}, backend_split);
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- if (backend == GGML_BACKEND_GPU) {
- vram_weights +=
- ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
- ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
- ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
- }
- }
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+ layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+ if (backend == GGML_BACKEND_GPU) {
+ vram_weights +=
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+ }
+ }
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ // TODO: CPU-only for now
+
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+
+ // output
+ {
+ ggml_backend backend_norm;
+ ggml_backend backend_output;
+
+ if (n_gpu_layers > int(n_layer)) {
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+ // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ } else {
+ backend_norm = GGML_BACKEND_CPU;
+ backend_output = GGML_BACKEND_CPU;
+ }
+
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+ model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ }
+
+ const uint32_t n_ff = hparams.n_ff;
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+
+ model.layers.resize(n_layer);
+
+ for (uint32_t i = 0; i < n_layer; ++i) {
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+ if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
+ layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
+ layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
+ }
+
+ layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ }
+ } break;
+ default:
+ throw std::runtime_error("unknown architecture");
+ };
}
- ml->done_getting_tensors();
+ ml.done_getting_tensors();
// print memory requirements
{
mmapped_size - vram_weights; // weights in VRAM not in memory
// this is the memory required by one llama_state
- const size_t mem_required_state =
- scale*hparams.kv_size();
+ const size_t mem_required_state = scale*hparams.kv_size();
LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
}
// populate `tensors_by_name`
- for (int i = 0; i < ml->n_tensors; ++i) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml->get_tensor_name(i));
+ for (int i = 0; i < ml.n_tensors; ++i) {
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
}
#endif
- ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
if (progress_callback) {
progress_callback(1.0f, progress_callback_user_data);
}
- model.mapping = std::move(ml->mapping);
+ model.mapping = std::move(ml.mapping);
    // loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
static bool llama_model_load(
const std::string & fname,
llama_model & model,
- llama_vocab & vocab,
int n_ctx,
int n_batch,
int n_gpu_layers,
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
- llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers,
- main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
- use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
- return true;
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+
+ llm_load_arch (*ml, model);
+ llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
+ llm_load_vocab (*ml, model);
+
+ llm_load_print_meta(*ml, model);
+
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
+ throw std::runtime_error("vocab size mismatch");
+ }
+
+ if (vocab_only) {
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
+ return true;
+ }
+
+ llm_load_tensors(
+ *ml, model, n_batch, n_gpu_layers,
+ main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
+ use_mlock, progress_callback, progress_callback_user_data);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
return false;
}
+
+ return true;
}
-static struct ggml_cgraph * llama_build_graph(
+static struct ggml_cgraph * llm_build_llama(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
const int n_gpu_layers = model.n_gpu_layers;
- auto & mem_per_token = lctx.mem_per_token;
- auto & buf_compute = lctx.buf_compute;
+ auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
offload_func(cur);
ggml_set_name(cur, "rms_norm_0");
- // cur = cur*attention_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
+ // cur = cur*attn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
offload_func(cur);
ggml_set_name(cur, "attention_norm_0");
}
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
offload_func_v(v);
- ggml_set_name(v, "v");
+ ggml_set_name(v, "v");
+
+ // important: storing RoPE-ed version of K in the KV cache!
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ }
+
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ offload_func_kq(Q);
+ ggml_set_name(Q, "Q");
+
+ struct ggml_tensor * K =
+ ggml_view_3d(ctx0, kv_self.k,
+ n_embd_head, n_past + N, n_head_kv,
+ ggml_element_size(kv_self.k)*n_embd_gqa,
+ ggml_element_size(kv_self.k)*n_embd_head,
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+ offload_func_kq(K);
+ ggml_set_name(K, "K");
+
+ // K * Q
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ offload_func_kq(KQ);
+ ggml_set_name(KQ, "KQ");
+
+ // KQ_scaled = KQ / sqrt(n_embd_head)
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+ offload_func_kq(KQ_scaled);
+ ggml_set_name(KQ_scaled, "KQ_scaled");
+
+ // KQ_masked = mask_past(KQ_scaled)
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ offload_func_kq(KQ_masked);
+ ggml_set_name(KQ_masked, "KQ_masked");
+
+ // KQ = soft_max(KQ_masked)
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+ offload_func_v(KQ_soft_max);
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+ // split cached V into n_head heads
+ struct ggml_tensor * V =
+ ggml_view_3d(ctx0, kv_self.v,
+ n_past + N, n_embd_head, n_head_kv,
+ ggml_element_size(kv_self.v)*n_ctx,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+ offload_func_v(V);
+ ggml_set_name(V, "V");
+
+#if 1
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+ offload_func_v(KQV);
+ ggml_set_name(KQV, "KQV");
+#else
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+ // is there a better way?
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ offload_func_v(KQV_merged);
+ ggml_set_name(KQV_merged, "KQV_merged");
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ cur = ggml_cpy(ctx0,
+ KQV_merged,
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ offload_func_v(cur);
+ ggml_set_name(cur, "KQV_merged_contiguous");
+
+ // projection (no bias)
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].wo,
+ cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_wo");
+ }
+
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+ offload_func(inpFF);
+ ggml_set_name(inpFF, "inpFF");
+
+ // feed-forward network
+ {
+ // norm
+ {
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+ offload_func(cur);
+ ggml_set_name(cur, "rms_norm_1");
+
+ // cur = cur*ffn_norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+ offload_func(cur);
+ ggml_set_name(cur, "ffn_norm");
+ }
+
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+ model.layers[il].w3,
+ cur);
+ offload_func(tmp);
+ ggml_set_name(tmp, "result_w3");
+
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].w1,
+ cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w1");
+
+ // SILU activation
+ cur = ggml_silu(ctx0, cur);
+ offload_func(cur);
+ ggml_set_name(cur, "silu");
+
+ cur = ggml_mul(ctx0, cur, tmp);
+ offload_func(cur);
+ ggml_set_name(cur, "silu_x_result_w3");
+
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].w2,
+ cur);
+ offload_func(cur);
+ ggml_set_name(cur, "result_w2");
+ }
+
+ cur = ggml_add(ctx0, cur, inpFF);
+ offload_func(cur);
+ ggml_set_name(cur, "inpFF_+_result_w2");
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ // norm
+ {
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+ offload_func_nr(cur);
+ ggml_set_name(cur, "rms_norm_2");
+
+ // cur = cur*norm(broadcasted)
+ cur = ggml_mul(ctx0, cur, model.output_norm);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+ ggml_set_name(cur, "result_norm");
+ }
+
+ // lm_head
+ cur = ggml_mul_mat(ctx0, model.output, cur);
+ ggml_set_name(cur, "result_output");
+
+ ggml_build_forward_expand(gf, cur);
+
+ ggml_free(ctx0);
+
+ return gf;
+}
+
+static struct ggml_cgraph * llm_build_falcon(
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ int n_tokens,
+ int n_past) {
+
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+ const int N = n_tokens;
+
+ const auto & model = lctx.model;
+ const auto & hparams = model.hparams;
+
+ const auto & kv_self = lctx.kv_self;
+
+ GGML_ASSERT(!!kv_self.ctx);
+
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_ctx = hparams.n_ctx;
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head = hparams.n_embd_head();
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ const float freq_base = hparams.rope_freq_base;
+ const float freq_scale = hparams.rope_freq_scale;
+ const float norm_eps = hparams.f_norm_eps;
+
+ const int n_gpu_layers = model.n_gpu_layers;
+
+ auto & buf_compute = lctx.buf_compute;
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ buf_compute.size,
+ /*.mem_buffer =*/ buf_compute.data,
+ /*.no_alloc =*/ false,
+ };
+
+ params.no_alloc = true;
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ if (tokens) {
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+ }
+ ggml_set_name(inp_tokens, "inp_tokens");
+
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+ } else {
+#ifdef GGML_USE_MPI
+ GGML_ASSERT(false && "not implemented");
+#endif
+
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+ ggml_allocr_alloc(lctx.alloc, inpL);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+ }
+ }
+
+ const int i_gpu_start = n_layer - n_gpu_layers;
+ (void) i_gpu_start;
+
+ // offload functions set the tensor output backend to GPU
+ // tensors are GPU-accelerated if any input or the output has been offloaded
+ //
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+ // in that case ggml_cuda_assign_buffers has no effect
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+ offload_func_t offload_func_kq = llama_nop;
+ offload_func_t offload_func_v = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > n_layer) {
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 1) {
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+ }
+ if (n_gpu_layers > n_layer + 2) {
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+ }
+#endif // GGML_USE_CUBLAS
+
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ }
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * attn_norm;
+
+ offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+ if (il >= i_gpu_start) {
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
+ }
+#endif // GGML_USE_CUBLAS
+
+ // self-attention
+ // TODO: refactor into common function (shared with LLaMA)
+ {
+ attn_norm = ggml_norm(ctx0, inpL, norm_eps);
+ offload_func(attn_norm);
+
+ attn_norm = ggml_add(ctx0,
+ ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm),
+ model.layers[il].attn_norm_b);
+ offload_func(attn_norm->src[0]);
+ offload_func(attn_norm);
+
+ if (model.layers[il].attn_norm_2) { // Falcon-40B
+ cur = ggml_norm(ctx0, inpL, norm_eps);
+ offload_func(cur);
+
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0, cur, model.layers[il].attn_norm_2),
+ model.layers[il].attn_norm_2_b);
+ offload_func(cur->src[0]);
+ offload_func(cur);
+ } else { // Falcon 7B
+ cur = attn_norm;
+ }
+
+ // compute QKV
+
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+ offload_func_kq(cur);
+
+ // Note that the strides for Kcur, Vcur are set up so that the
+ // resulting views are misaligned with the tensor's storage
+ // (by applying the K/V offset we shift the tensor's original
+ // view to stick out behind the viewed QKV tensor's allocated
+            // memory, so to speak). This is ok because no actual accesses
+ // happen to that out-of-range memory, but it can require some
+ // trickery when trying to accurately dump these views for
+ // debugging.
+
+ const size_t wsize = ggml_type_size(cur->type);
+
+ struct ggml_tensor * tmpq = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head, N,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ 0);
+ offload_func_kq(tmpq);
+
+ struct ggml_tensor * tmpk = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head_kv, N,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ wsize * n_embd_head * n_head);
+ offload_func_kq(tmpk);
+
+ struct ggml_tensor * tmpv = ggml_view_3d(
+ ctx0, cur, n_embd_head, n_head_kv, N,
+ wsize * n_embd_head,
+ wsize * n_embd_head * (n_head + 2 * n_head_kv),
+ wsize * n_embd_head * (n_head + n_head_kv));
+ offload_func_v(tmpv);
+
+ // using mode = 2 for neox mode
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, tmpq, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+ offload_func_kq(Qcur);
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, tmpk, n_past, n_embd_head, 2, 0, freq_base, freq_scale);
+ offload_func_kq(Kcur);
+
+ {
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+ offload_func_v(Vcur);
+ offload_func_v(Vcur->src[0]->src[0]);
+ ggml_set_name(Vcur, "Vcur");
+
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+ offload_func_kq(k);
+ ggml_set_name(k, "k");
+
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+ ( n_ctx)*ggml_element_size(kv_self.v),
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+ offload_func_v(v);
- // important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
- struct ggml_tensor * Q =
- ggml_permute(ctx0,
- Qcur,
- 0, 2, 1, 3);
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
offload_func_kq(Q);
ggml_set_name(Q, "Q");
offload_func_kq(K);
ggml_set_name(K, "K");
- // K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
offload_func_kq(KQ);
ggml_set_name(KQ, "KQ");
- // KQ_scaled = KQ / sqrt(n_embd_head)
- // KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
offload_func_kq(KQ_scaled);
ggml_set_name(KQ_scaled, "KQ_scaled");
- // KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
offload_func_kq(KQ_masked);
ggml_set_name(KQ_masked, "KQ_masked");
- // KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
offload_func_v(KQ_soft_max);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
- // split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
n_past + N, n_embd_head, n_head_kv,
offload_func_v(V);
ggml_set_name(V, "V");
-#if 1
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
offload_func_v(KQV);
ggml_set_name(KQV, "KQV");
-#else
- // make V contiguous in memory to speed up the matmul, however we waste time on the copy
- // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
- // is there a better way?
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif
- // KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
offload_func_v(KQV_merged);
ggml_set_name(KQV_merged, "KQV_merged");
- // cur = KQV_merged.contiguous().view(n_embd, N)
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
offload_func_v(cur);
ggml_set_name(cur, "KQV_merged_contiguous");
- // projection (no bias)
- cur = ggml_mul_mat(ctx0,
- model.layers[il].wo,
- cur);
+ cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
offload_func(cur);
ggml_set_name(cur, "result_wo");
}
- struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
- offload_func(inpFF);
- ggml_set_name(inpFF, "inpFF");
+ struct ggml_tensor * attn_out = cur;
- // feed-forward network
+ // feed forward
{
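+ // Falcon uses a parallel attention/FF block: the FF branch reads attn_norm
+ // (for Falcon-40B the attention branch uses the separate attn_norm_2), and
+ // the outputs of both branches are added to the residual (inpL) below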
- // norm
- {
- cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
- offload_func(cur);
- ggml_set_name(cur, "rms_norm_1");
-
- // cur = cur*ffn_norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
- offload_func(cur);
- ggml_set_name(cur, "ffn_norm");
- }
-
- struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
- model.layers[il].w3,
- cur);
- offload_func(tmp);
- ggml_set_name(tmp, "result_w3");
+ struct ggml_tensor * inpFF = attn_norm;
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w1,
- cur);
- offload_func(cur);
- ggml_set_name(cur, "result_w1");
+ cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF);
- // SILU activation
- cur = ggml_silu(ctx0, cur);
+ // TODO: this is temporarily needed to introduce an artificial dependency between FF and ATTN
+ // there seems to be a bug in the Metal concurrency optimization:
+ // without this line, the results are non-deterministic and wrong
+ cur->src[2] = attn_out;
offload_func(cur);
- ggml_set_name(cur, "silu");
- cur = ggml_mul(ctx0, cur, tmp);
+ cur = ggml_gelu(ctx0, cur);
offload_func(cur);
- ggml_set_name(cur, "silu_x_result_w3");
-
- cur = ggml_mul_mat(ctx0,
- model.layers[il].w2,
- cur);
+ cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
offload_func(cur);
- ggml_set_name(cur, "result_w2");
}
- cur = ggml_add(ctx0, cur, inpFF);
+ cur = ggml_add(ctx0, cur, attn_out);
+ offload_func(cur);
+ cur = ggml_add(ctx0, cur, inpL);
offload_func(cur);
- ggml_set_name(cur, "inpFF_+_result_w2");
// input for next layer
inpL = cur;
}
+ cur = inpL;
+
// norm
{
- cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+ cur = ggml_norm(ctx0, cur, norm_eps);
offload_func_nr(cur);
- ggml_set_name(cur, "rms_norm_2");
- // cur = cur*norm(broadcasted)
- cur = ggml_mul(ctx0, cur, model.norm);
- // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0, cur, model.output_norm),
+ model.output_norm_b);
ggml_set_name(cur, "result_norm");
}
- // lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
- // logits -> probs
- //cur = ggml_soft_max_inplace(ctx0, cur);
-
ggml_build_forward_expand(gf, cur);
- if (mem_per_token == 0) {
- mem_per_token = ggml_used_mem(ctx0)/N;
- }
-
ggml_free(ctx0);
return gf;
}
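+
+// dispatch to the graph-building function for the model architecture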
+static struct ggml_cgraph * llama_build_graph(
+ llama_context & lctx,
+ const llama_token * tokens,
+ const float * embd,
+ int n_tokens,
+ int n_past) {
+ const auto & model = lctx.model;
+
+ struct ggml_cgraph * result = NULL;
+
+ switch (model.arch) {
+ case LLM_ARCH_LLAMA:
+ {
+ result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
+ } break;
+ case LLM_ARCH_FALCON:
+ {
+ result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
+ } break;
+ default:
+ GGML_ASSERT(false);
+ };
+
+ return result;
+}
+
// evaluate the transformer
//
// - lctx: llama context
GGML_ASSERT(!!kv_self.ctx);
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_vocab = hparams.n_vocab;
ggml_allocr_reset(lctx.alloc);
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
- GGML_ASSERT(strcmp(res->name, "result_output") == 0);
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+ GGML_ASSERT(strcmp(res->name, "result_output") == 0);
+ GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
#if GGML_USE_MPI
const int64_t n_layer = hparams.n_layer;
return word;
}
-static size_t utf8_len(char src) {
- const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
- uint8_t highbits = static_cast<uint8_t>(src) >> 4;
- return lookup[highbits];
-}
-
-struct llama_sp_symbol {
+struct llm_symbol {
using index = int;
index prev;
index next;
size_t n;
};
-static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
+
+// SPM tokenizer
+// original implementation:
+// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
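+//
+// greedily merges adjacent symbol pairs into vocab tokens, always taking the pair whose merged
+// token has the highest score (see llm_bigram_spm::comparator); pieces that never form a token
+// are re-split into byte tokens in resegment()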
-struct llama_sp_bigram {
+struct llm_bigram_spm {
struct comparator {
- bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
+ bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
}
};
- using queue_storage = std::vector<llama_sp_bigram>;
- using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
- llama_sp_symbol::index left;
- llama_sp_symbol::index right;
+ using queue_storage = std::vector<llm_bigram_spm>;
+ using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
+ llm_symbol::index left;
+ llm_symbol::index right;
float score;
size_t size;
};
-// original implementation:
-// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
-struct llama_tokenizer {
- llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
+struct llm_tokenizer_spm {
+ llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars
int index = 0;
size_t offs = 0;
while (offs < text.size()) {
- llama_sp_symbol sym;
+ llm_symbol sym;
size_t len = utf8_len(text[offs]);
GGML_ASSERT(offs + len <= text.size());
sym.text = text.c_str() + offs;
sym.prev = index - 1;
sym.next = offs == text.size() ? -1 : index + 1;
index++;
- symbols_.emplace_back(sym);
+ symbols.emplace_back(sym);
}
// seed the work queue with all possible 2-character tokens.
- for (size_t i = 1; i < symbols_.size(); ++i) {
+ for (size_t i = 1; i < symbols.size(); ++i) {
try_add_bigram(i - 1, i);
}
// keep substituting the highest frequency pairs for as long as we can.
- while (!work_queue_.empty()) {
- auto bigram = work_queue_.top();
- work_queue_.pop();
+ while (!work_queue.empty()) {
+ auto bigram = work_queue.top();
+ work_queue.pop();
- auto & left_sym = symbols_[bigram.left];
- auto & right_sym = symbols_[bigram.right];
+ auto & left_sym = symbols[bigram.left];
+ auto & right_sym = symbols[bigram.right];
// if one of the symbols already got merged, skip it.
if (left_sym.n == 0 || right_sym.n == 0 ||
// remove the right sym from the chain
left_sym.next = right_sym.next;
if (right_sym.next >= 0) {
- symbols_[right_sym.next].prev = bigram.left;
+ symbols[right_sym.next].prev = bigram.left;
}
// find more substitutions
try_add_bigram(bigram.left, left_sym.next);
}
- for (int i = 0; i != -1; i = symbols_[i].next) {
- auto & symbol = symbols_[i];
+ for (int i = 0; i != -1; i = symbols[i].next) {
+ auto & symbol = symbols[i];
resegment(symbol, output);
}
}
private:
- void resegment(llama_sp_symbol &symbol, std::vector<llama_vocab::id> &output) {
+ void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
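+ // emit the token for this symbol if it is in the vocab; otherwise re-split it into the
+ // pair recorded in rev_merge and recurse, falling back to byte tokens for unknown pieces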
auto text = std::string(symbol.text, symbol.n);
- auto token = vocab_.token_to_id.find(text);
+ auto token = vocab.token_to_id.find(text);
// Do we need to support is_unused?
- if (token != vocab_.token_to_id.end()) {
+ if (token != vocab.token_to_id.end()) {
output.push_back((*token).second);
return;
}
if (p == rev_merge.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < (int)symbol.n; ++j) {
- llama_vocab::id token_id = llama_byte_to_token(vocab_, symbol.text[j]);
+ llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
output.push_back(token_id);
}
return;
}
- resegment(symbols_[p->second.first], output);
- resegment(symbols_[p->second.second], output);
+ resegment(symbols[p->second.first], output);
+ resegment(symbols[p->second.second], output);
}
void try_add_bigram(int left, int right) {
return;
}
- const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
- auto token = vocab_.token_to_id.find(text);
+ const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
+ auto token = vocab.token_to_id.find(text);
- if (token == vocab_.token_to_id.end()) {
+ if (token == vocab.token_to_id.end()) {
return;
}
- if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
+ if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
return;
}
- const auto &tok_data = vocab_.id_to_token[(*token).second];
+ const auto & tok_data = vocab.id_to_token[(*token).second];
- llama_sp_bigram bigram;
- bigram.left = left;
+ llm_bigram_spm bigram;
+ bigram.left = left;
bigram.right = right;
bigram.score = tok_data.score;
- bigram.size = text.size();
- work_queue_.push(bigram);
+ bigram.size = text.size();
+
+ work_queue.push(bigram);
// Do we need to support is_unused?
rev_merge[text] = std::make_pair(left, right);
}
- const llama_vocab & vocab_;
- std::vector<llama_sp_symbol> symbols_;
- llama_sp_bigram::queue work_queue_;
- std::map<std::string, std::pair<int, int> > rev_merge;
+ const llama_vocab & vocab;
+
+ std::vector<llm_symbol> symbols;
+ llm_bigram_spm::queue work_queue;
+
+ std::map<std::string, std::pair<int, int>> rev_merge;
+};
+
+// BPE tokenizer
+// adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// the unicode handling has been simplified, so it most likely does not work 100% correctly!
+
+// TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
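+//
+// the input is first split into words with a GPT-2 style regex (bpe_gpt2_preprocess), then
+// within each word adjacent symbol pairs are merged in order of ascending merge rank until
+// no known merge remains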
+
+struct llm_bigram_bpe {
+ struct comparator {
+ bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) {
+ return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+ }
+ };
+
+ using queue_storage = std::vector<llm_bigram_bpe>;
+ using queue = std::priority_queue<llm_bigram_bpe, queue_storage, comparator>;
+ llm_symbol::index left;
+ llm_symbol::index right;
+ std::string text;
+ int rank;
+ size_t size;
+};
+
+struct llm_tokenizer_bpe {
+ llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
+
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ int final_prev_index = -1;
+ auto word_collection = bpe_gpt2_preprocess(text);
+
+ symbols_final.clear();
+
+ for (auto & word : word_collection) {
+ work_queue = llm_bigram_bpe::queue();
+ symbols.clear();
+
+ int index = 0;
+ size_t offset = 0;
+
+ while (offset < word.size()) {
+ llm_symbol sym;
+ size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
+ sym.text = word.c_str() + offset;
+ sym.n = char_len;
+ offset += sym.n;
+ sym.prev = index - 1;
+ sym.next = offset == word.size() ? -1 : index + 1;
+ index++;
+ symbols.emplace_back(sym);
+ }
+ for (size_t i = 1; i < symbols.size(); ++i) {
+ add_new_bigram(i - 1, i);
+ }
+
+ // build token(s)
+ while (!work_queue.empty()) {
+ auto bigram = work_queue.top();
+ work_queue.pop();
+
+ auto & left_symbol = symbols[bigram.left];
+ auto & right_symbol = symbols[bigram.right];
+
+ if (left_symbol.n == 0 || right_symbol.n == 0) {
+ continue;
+ }
+ std::string left_token = std::string(left_symbol.text, left_symbol.n);
+ std::string right_token = std::string(right_symbol.text, right_symbol.n);
+ if (left_token + right_token != bigram.text) {
+ continue; // Skip this bigram if it's outdated
+ }
+
+ // merge the right sym into the left one
+ left_symbol.n += right_symbol.n;
+ right_symbol.n = 0;
+
+ // remove the right sym from the chain
+ left_symbol.next = right_symbol.next;
+ if (right_symbol.next >= 0) {
+ symbols[right_symbol.next].prev = bigram.left;
+ }
+
+ add_new_bigram(left_symbol.prev, bigram.left); // left side of current symbol
+ add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol
+ }
+
+ // add the finished tokens to the final list, keeping correct order for next and prev
+ for (auto & sym : symbols) {
+ if (sym.n > 0) {
+ sym.prev = final_prev_index;
+ sym.next = -1;
+ if (final_prev_index != -1) {
+ symbols_final[final_prev_index].next = symbols_final.size();
+ }
+ symbols_final.emplace_back(sym);
+ final_prev_index = symbols_final.size() - 1;
+ }
+ }
+ }
+
+ symbols = symbols_final;
+
+ if (!symbols.empty()) {
+ for (int i = 0; i != -1; i = symbols[i].next) {
+ auto & symbol = symbols[i];
+ if (symbol.n == 0) {
+ continue;
+ }
+
+ const std::string str = std::string(symbol.text, symbol.n);
+ const auto token = vocab.token_to_id.find(str);
+
+ if (token == vocab.token_to_id.end()) {
+ for (auto j = str.begin(); j != str.end(); ++j) {
+ std::string byte_str(1, *j);
+ auto token_multibyte = vocab.token_to_id.find(byte_str);
+ if (token_multibyte == vocab.token_to_id.end()) {
+ fprintf(stderr, "ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ continue; // skip this byte instead of dereferencing the end() iterator
+ }
+ output.push_back((*token_multibyte).second);
+ }
+ } else {
+ output.push_back((*token).second);
+ }
+ }
+ }
+ }
+
+private:
+ void add_new_bigram(int left, int right) {
+ if (left == -1 || right == -1) {
+ return;
+ }
+
+ std::string left_token = std::string(symbols[left].text, symbols[left].n);
+ std::string right_token = std::string(symbols[right].text, symbols[right].n);
+
+ int rank_found = -1;
+
+ rank_found = vocab.find_bpe_rank(left_token, right_token);
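+ // lower rank = higher merge priority (see llm_bigram_bpe::comparator)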
+
+ if (rank_found < 0) {
+ return;
+ }
+
+ llm_bigram_bpe bigram;
+
+ bigram.left = left;
+ bigram.right = right;
+ bigram.text = left_token + right_token;
+ bigram.size = left_token.size() + right_token.size();
+ bigram.rank = rank_found;
+
+ work_queue.push(bigram);
+ }
+
+ // probably not 100% correct
+ // TODO: this is quite slow - how to make it more efficient?
+ static std::vector<std::string> bpe_gpt2_preprocess(std::string text) {
+ std::vector<std::string> words;
+
+ // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
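+ // roughly: contractions ('s, 't, 're, ...), letter runs, digit runs and punctuation runs
+ // (each with an optional leading space), plus whitespace runs, are emitted as separate
+ // words, e.g. "I'm here" -> "I", "'m", " here"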
+ const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+ const std::regex re(pattern);
+ std::smatch m;
+
+ while (std::regex_search(text, m, re)) {
+ for (auto x : m) {
+ words.push_back(x);
+ }
+ text = m.suffix();
+ }
+
+ return words;
+ }
+
+ bool flag_g2ws = false;
+
+ const llama_vocab & vocab;
+
+ std::vector<llm_symbol> symbols;
+ std::vector<llm_symbol> symbols_final;
+
+ llm_bigram_bpe::queue work_queue;
};
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
- llama_tokenizer tokenizer(vocab);
std::vector<llama_vocab::id> output;
if (raw_text.empty()) {
return output;
}
- if (bos) {
- output.push_back(vocab.special_bos_id);
- }
+ switch (vocab.type) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ {
+ llm_tokenizer_spm tokenizer(vocab);
- std::string text;
- if (escape) {
- text = llama_escape_whitespace(raw_text);
- } else {
- text = raw_text;
- }
+ if (bos) {
+ output.push_back(vocab.special_bos_id);
+ }
+
+ std::string text;
+ if (escape) {
+ text = llama_escape_whitespace(raw_text);
+ } else {
+ text = raw_text;
+ }
+
+ tokenizer.tokenize(text, output);
+ } break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ {
+ llm_tokenizer_bpe tokenizer(vocab, escape);
+
+ if (bos && vocab.special_bos_id != -1) {
+ output.push_back(vocab.special_bos_id);
+ }
+
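+ // note: unlike the SPM path above, the raw text is passed to the tokenizer without
+ // whitespace escaping; the escape flag is only forwarded as the g2ws option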
+ tokenizer.tokenize(raw_text, output);
+ } break;
+ };
- tokenizer.tokenize(text, output);
return output;
}
nthread = std::thread::hardware_concurrency();
}
- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty();
// copy the KV pairs from the input file
- gguf_set_kv (ctx_out, model_loader->ctx_gguf);
+ gguf_set_kv (ctx_out, ml->ctx_gguf);
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
- for (int i = 0; i < model_loader->n_tensors; ++i) {
- struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+ for (int i = 0; i < ml->n_tensors; ++i) {
+ struct ggml_tensor * meta = ml->get_tensor_meta(i);
const std::string name = ggml_get_name(meta);
std::vector<uint8_t> work;
// populate the original tensors so we get an initial meta data
- for (int i = 0; i < model_loader->n_tensors; ++i) {
- struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+ for (int i = 0; i < ml->n_tensors; ++i) {
+ struct ggml_tensor * meta = ml->get_tensor_meta(i);
gguf_add_tensor(ctx_out, meta);
}
// placeholder for the meta data
::zeros(fout, meta_size);
- for (int i = 0; i < model_loader->n_tensors; ++i) {
- struct ggml_tensor * tensor = model_loader->get_tensor_meta(i);
+ for (int i = 0; i < ml->n_tensors; ++i) {
+ struct ggml_tensor * tensor = ml->get_tensor_meta(i);
const std::string name = ggml_get_name(tensor);
read_data.resize(ggml_nbytes(tensor));
tensor->data = read_data.data();
- model_loader->load_data_for(tensor);
+ ml->load_data_for(tensor);
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
- ++idx, model_loader->n_tensors,
+ ++idx, ml->n_tensors,
ggml_get_name(tensor),
llama_format_tensor_shape(tensor).c_str(),
ggml_type_name(tensor->type));
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
// TODO: avoid hardcoded tensor names - use the TN_* constants
- if (name == TN_OUTPUT) {
+ const auto tn = LLM_TN(ml->get_arch());
+
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K == 0 && ny % QK_K == 0) {
}
}
if (convert_incompatible_tensor) {
- if (name == TN_OUTPUT) {
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
- } else if (name == TN_TOKEN_EMBD) {
+ } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
} else {
}
// load base model
- std::unique_ptr<llama_model_loader> model_loader;
+ std::unique_ptr<llama_model_loader> ml;
ggml_context * base_ctx = NULL;
std::vector<uint8_t> base_buf;
if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
size_t ctx_size;
size_t mmapped_size;
- model_loader->calc_sizes(ctx_size, mmapped_size);
+ ml->calc_sizes(ctx_size, mmapped_size);
base_buf.resize(ctx_size);
ggml_init_params base_params;
base_params.mem_size = base_buf.size();
base_params.mem_buffer = base_buf.data();
- base_params.no_alloc = model_loader->use_mmap;
+ base_params.no_alloc = ml->use_mmap;
base_ctx = ggml_init(base_params);
// maybe this should be in llama_model_loader
- if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa()));
+ if (ml->use_mmap) {
+ ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa()));
}
}
#endif // GGML_USE_CUBLAS
ggml_tensor * base_t;
- if (model_loader) {
- struct gguf_context * ctx_gguf = model_loader->ctx_gguf;
+ if (ml) {
+ struct gguf_context * ctx_gguf = ml->ctx_gguf;
// load from base model
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
+ // TODO: throw
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}
// TODO: not tested!! maybe not working!
- base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
- model_loader->load_data_for(base_t);
+ base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+ ml->load_data_for(base_t);
} else {
base_t = dest_t;
}
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
+ unsigned cur_percentage = 0;
+ if (params.progress_callback == NULL) {
+ params.progress_callback_user_data = &cur_percentage;
+ params.progress_callback = [](float progress, void * ctx) {
+ unsigned * cur_percentage_p = (unsigned *) ctx;
+ unsigned percentage = (unsigned) (100 * progress);
+ while (percentage > *cur_percentage_p) {
+ *cur_percentage_p = percentage;
+ LLAMA_LOG_INFO(".");
+ if (percentage >= 100) {
+ LLAMA_LOG_INFO("\n");
+ }
+ }
+ };
+ }
+
+ if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,
params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
params.progress_callback, params.progress_callback_user_data)) {
params.seed = time(NULL);
}
- unsigned cur_percentage = 0;
- if (params.progress_callback == NULL) {
- params.progress_callback_user_data = &cur_percentage;
- params.progress_callback = [](float progress, void * ctx) {
- unsigned * cur_percentage_p = (unsigned *) ctx;
- unsigned percentage = (unsigned) (100 * progress);
- while (percentage > *cur_percentage_p) {
- *cur_percentage_p = percentage;
- LLAMA_LOG_INFO(".");
- if (percentage >= 100) {
- LLAMA_LOG_INFO("\n");
- }
- }
- };
- }
-
ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params) {
-
struct llama_model * model = llama_load_model_from_file(path_model, params);
if (!model) {
return nullptr;
}
+
struct llama_context * ctx = llama_new_context_with_model(model, params);
ctx->model_owner = true;
+
return ctx;
}
return ctx->model.hparams.n_embd;
}
+enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
+ return ctx->model.vocab.type;
+}
+
int llama_model_n_vocab(const struct llama_model * model) {
return model->vocab.id_to_token.size();
}
}
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str());
+ return snprintf(buf, buf_size, "%s %s %s",
+ model->name.c_str(),
+ llama_model_type_name(model->type),
+ llama_model_ftype_name(model->ftype).c_str());
}
int llama_model_quantize(
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
}
-int llama_tokenize_bpe(
- struct llama_context * ctx,
- const char * text,
- llama_token * tokens,
- int n_max_tokens,
- bool add_bos) {
- auto res = llama_tokenize_internal(ctx->model.vocab, text, add_bos, false);
-
- if (n_max_tokens < (int) res.size()) {
- LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
- return -((int) res.size());
- }
-
- for (size_t i = 0; i < res.size(); i++) {
- tokens[i] = res[i];
- }
-
- return res.size();
-}
-
int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
return llama_token_to_str_with_model(&ctx->model, token, buf, length);
}
-int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
- if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
- std::string result = ctx->model.vocab.id_to_token[token].text;
- if (length < (int) result.length()) {
- return -result.length();
- }
- memcpy(buf, result.c_str(), result.length());
- return result.length();
- }
- return 0;
-}
-
// does not write null-terminator to str
int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
if (0 <= token && token < llama_model_n_vocab(model)) {