id: cmake_test
run: |
cd build
- ctest -L main --verbose --timeout 900
+ ctest -L main -E "test-llama-archs" --verbose --timeout 900
macOS-latest-cmake-x64:
runs-on: macos-15-intel
[](common_params & params, const std::string & value) {
params.out_file = value;
}
- ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE}));
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_FINETUNE, LLAMA_EXAMPLE_RESULTS}));
add_opt(common_arg(
{"-ofreq", "--output-frequency"}, "N",
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
}
}
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
+ add_opt(common_arg(
+ {"--check"},
+ string_format("check rather than generate results (default: %s)", params.check ? "true" : "false"),
+ [](common_params & params) {
+ params.check = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_RESULTS}));
add_opt(common_arg(
{"--save-logits"},
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
LLAMA_EXAMPLE_DIFFUSION,
LLAMA_EXAMPLE_FINETUNE,
LLAMA_EXAMPLE_FIT_PARAMS,
+ LLAMA_EXAMPLE_RESULTS,
LLAMA_EXAMPLE_COUNT,
};
bool kl_divergence = false; // compute KL divergence
+ bool check = false; // check rather than generate results for llama-results
+
bool usage = false; // print usage
bool completion = false; // print source-able completion script
bool use_color = false; // use color to distinguish generations and inputs
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include "ggml-opt.h"
+#include "gguf.h"
#include <stddef.h>
#include <stdint.h>
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+ typedef void (*llama_model_set_tensor_data_t)(struct ggml_tensor * tensor, void * userdata);
+
+ // Create a new model from GGUF metadata as well as a function to set the tensor data
+ // - tensors are created as GGML_TYPE_F32 by default,
+ //   override by adding a tensor with the same name but a different type to the context
+ LLAMA_API struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
+ void * set_tensor_data_ud, // userdata for function
+ struct llama_model_params params);
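A minimal usage sketch for this entry point, assuming the tensor buffers are already allocated and writable when the callback runs; `zero_init` and `make_zero_model` are illustrative names, not part of the API:

    // hedged sketch: build a model whose tensors are all zero-initialized
    // (assumes llama.h, ggml.h and gguf.h are included)
    #include <string.h> // memset

    static void zero_init(struct ggml_tensor * tensor, void * userdata) {
        (void) userdata;
        memset(tensor->data, 0, ggml_nbytes(tensor));
    }

    static struct llama_model * make_zero_model(struct gguf_context * metadata) {
        struct llama_model_params mparams = llama_model_default_params();
        return llama_model_init_from_user(metadata, zero_init, /*set_tensor_data_ud=*/NULL, mparams);
    }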
+
DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params),
"use llama_model_load_from_file instead");
- // Load the model from a file
+ // Load a model from a file
// If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
// If the split file name does not follow this pattern, use llama_model_load_from_splits
LLAMA_API struct llama_model * llama_model_load_from_file(
const char * path_model,
struct llama_model_params params);
- // Load the model from multiple splits (support custom naming scheme)
+ // Load a model from multiple splits (support custom naming scheme)
// The paths must be in the correct order
LLAMA_API struct llama_model * llama_model_load_from_splits(
const char ** paths,
--- /dev/null
+#!/usr/bin/env bash
+
+cmake_args=()
+llama_results_args=()
+
+for arg in "${@}"; do
+ if [[ "$arg" == -D* ]]; then
+ cmake_args+=("$arg")
+ else
+ llama_results_args+=("$arg")
+ fi
+done
+
+dir="build-bisect"
+rm -rf ${dir} > /dev/null
+cmake -B ${dir} -S . "${cmake_args[@]}" > /dev/null
+cmake --build ${dir} -t llama-results -j $(nproc) > /dev/null
+${dir}/bin/llama-results "${llama_results_args[@]}"
--- /dev/null
+#!/usr/bin/env bash
+
+if [ $# -lt 2 ]; then
+ echo "usage: ./scripts/git-bisect.sh <commit_bad> <commit_good> [additional arguments]"
+ echo " additional arguments: passed to CMake if they start with \"-D\", to llama-results otherwise"
+ exit 1
+fi
+
+set -e
+set -x
+
+commit_bad=$1
+commit_good=$2
+script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+git checkout ${commit_good}
+${script_dir}/git-bisect-run.sh --output results.gguf "${@:3}"
+git bisect start ${commit_bad} ${commit_good}
+git bisect run ${script_dir}/git-bisect-run.sh --output results.gguf --check "${@:3}"
+git bisect reset
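For example (the commit hashes and model path are placeholders, and the -m flag assumes llama-results accepts the common model argument), a CUDA regression could be bisected with:

    ./scripts/git-bisect.sh <commit_bad> <commit_good> -DGGML_CUDA=ON -m models/model.gguf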
#include <map>
#include <set>
+#include <vector>
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
return name;
}
+std::vector<llm_arch> llm_arch_all() {
+ std::vector<llm_arch> ret;
+ ret.reserve(LLM_ARCH_NAMES.size());
+ for (const auto & [arch, _] : LLM_ARCH_NAMES) {
+ ret.push_back(arch);
+ }
+ return ret;
+}
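A short usage sketch for the new helper, e.g. from an architecture-coverage test; the loop below is illustrative and assumes <cstdio> is included:

    // print the canonical name of every registered architecture
    for (const llm_arch arch : llm_arch_all()) {
        printf("%s\n", llm_arch_name(arch));
    }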
+
const char * llm_arch_name(llm_arch arch) {
auto it = LLM_ARCH_NAMES.find(arch);
if (it == LLM_ARCH_NAMES.end()) {
#include <string>
#include <set>
+#include <vector>
//
// gguf constants (sync with gguf.py)
ggml_op op;
};
+std::vector<llm_arch> llm_arch_all();
+
const char * llm_arch_name(llm_arch arch);
llm_arch llm_arch_from_string(const std::string & name);
{
//const auto t_start_us = ggml_time_us();
+ // FIXME this call causes a crash if any model inputs were not used in the graph and were therefore not allocated
res->set_inputs(&ubatch);
//LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
float * data = (float *) cross_kq_mask->data;
for (int i = 0; i < n_tokens; ++i) {
+ GGML_ASSERT(!cross->seq_ids_enc.empty() && "llama_encode must be called first");
for (int j = 0; j < n_enc; ++j) {
float f = -INFINITY;
return cur;
}
+// TODO remove redundant scale_w argument
ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * cur,
ggml_tensor * gate_inp,
// this need to be 1x1xN for broadcasting
cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur);
+ ggml_set_name(cur, "attn_scale");
res->add_input(std::move(inp));
#include "llama-model-loader.h"
+#include "ggml-alloc.h"
#include "ggml.h"
+#include "gguf.h"
+#include "llama-hparams.h"
#include <algorithm>
#include <array>
#include <cinttypes>
+#include <cstdint>
#include <cstring>
#include <future>
+#include <regex>
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
llama_model_loader::get_arr_n(const std::string & key, T & result, bool required) {
- const int kid = gguf_find_key(meta.get(), key.c_str());
+ const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
}
struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
result = arr_info.length;
template<typename T>
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
- const gguf_context * ctx = meta.get();
+ const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
template<typename T, size_t N_MAX>
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
- const gguf_context * ctx = meta.get();
+ const gguf_context * ctx = metadata;
const int kid = gguf_find_key(ctx, key.c_str());
if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
const struct llama_model_kv_override * override =
it != kv_overrides.end() ? &it->second : nullptr;
- const bool found = GGUFMeta::GKV<T>::set(meta.get(), key, result, override);
+ const bool found = GGUFMeta::GKV<T>::set(metadata, key, result, override);
if (required && !found) {
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
// get array of n <= N_MAX elements, or a single element repeated n times
template<typename T, size_t N_MAX>
bool llama_model_loader::get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, bool required) {
- const int kid = gguf_find_key(meta.get(), key.c_str());
+ const int kid = gguf_find_key(metadata, key.c_str());
if (kid < 0) {
if (required) {
throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
}
- if (gguf_get_kv_type(meta.get(), kid) == GGUF_TYPE_ARRAY) {
+ if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) {
struct GGUFMeta::ArrayInfo arr_info =
- GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);
+ GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(metadata, kid);
if (n != arr_info.length) {
throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
const std::string key = llm_kv(kid);
- const int id = gguf_find_key(meta.get(), key.c_str());
+ const int id = gguf_find_key(metadata, key.c_str());
if (id < 0) {
if (required) {
}
// throw an error if type is an array
- if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (gguf_get_kv_type(metadata, id) == GGUF_TYPE_ARRAY) {
if (required) {
throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
}
llama_model_loader::llama_model_loader(
+ struct gguf_context * meta,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
- const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) {
+ const llama_model_tensor_buft_override * param_tensor_buft_overrides_p)
+ : metadata(meta), set_tensor_data(set_tensor_data), set_tensor_data_ud(set_tensor_data_ud) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
tensor_buft_overrides = param_tensor_buft_overrides_p;
- // Load the main GGUF
- struct ggml_context * ctx = NULL;
- struct gguf_init_params params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx,
- };
-
- meta.reset(gguf_init_from_file(fname.c_str(), params));
- if (!meta) {
- throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
- }
-
- get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
- llm_kv = LLM_KV(llm_arch_from_string(arch_name));
-
- files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
- contexts.emplace_back(ctx);
+ if (!fname.empty()) {
+ // Load the main GGUF
+ struct ggml_context * ctx = NULL;
+ struct gguf_init_params params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
- if (use_mmap && use_direct_io) {
- if (files.back()->has_direct_io()) {
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
- use_mmap = false;
- } else {
- LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
- use_direct_io = false;
-
- // reopen file using std::fopen for mmap
- files.pop_back();
- files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ metadata_ptr.reset(gguf_init_from_file(fname.c_str(), params));
+ metadata = metadata_ptr.get();
+ if (metadata == nullptr) {
+ throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
}
- }
- // Save tensors data offset of the main file.
- // For subsidiary files, `meta` tensor data offset must not be used,
- // so we build a unified tensors index for weights.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- std::string tensor_name = std::string(cur->name);
- // make sure there is no duplicated tensor names
- if (weights_map.find(tensor_name) != weights_map.end()) {
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
- }
- n_elements += ggml_nelements(cur);
- n_bytes += ggml_nbytes(cur);
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, meta.get(), cur));
- }
- uint16_t n_split = 0;
- get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
- // Load additional GGML contexts
- if (n_split > 1) {
- // make sure the main file is loaded first
- uint16_t idx = 0;
- const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
- get_key(kv_split_no, idx);
- if (idx != 0) {
- throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
- }
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
+ contexts.emplace_back(ctx);
- // generate list of splits if needed
- if (splits.empty()) {
- splits = llama_get_list_splits(fname, idx, n_split);
- }
+ if (use_mmap && use_direct_io) {
+ if (files.back()->has_direct_io()) {
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+ use_mmap = false;
+ } else {
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
+ use_direct_io = false;
- // in case user give a custom list of splits, check if it matches the expected number
- if (n_split != (uint16_t)splits.size()) {
- throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
+ // reopen file using std::fopen for mmap
+ files.pop_back();
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+ }
}
- if (trace > 0) {
- LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
- }
+ // Save tensor data offsets of the main file.
+ // For subsidiary files, `metadata` tensor data offsets must not be used,
+ // so we build a unified tensor index for weights.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+ // make sure there are no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), 0, metadata, cur));
+ }
+ uint16_t n_split = 0;
+ get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
+
+ // Load additional GGML contexts
+ if (n_split > 1) {
+ // make sure the main file is loaded first
+ uint16_t idx = 0;
+ const std::string kv_split_no = llm_kv(LLM_KV_SPLIT_NO);
+ get_key(kv_split_no, idx);
+ if (idx != 0) {
+ throw std::runtime_error(format("illegal split file idx: %d (file: %s), model must be loaded with the first split", idx, fname.c_str()));
+ }
- // load other splits
- for (idx = 1; idx < n_split; idx++) {
- const char * fname_split = splits[idx].c_str();
+ // generate list of splits if needed
+ if (splits.empty()) {
+ splits = llama_get_list_splits(fname, idx, n_split);
+ }
- struct gguf_init_params split_params = {
- /*.no_alloc = */ true,
- /*.ctx = */ &ctx,
- };
- gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
- if (!ctx_gguf) {
- throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
+ // in case the user gives a custom list of splits, check if it matches the expected number
+ if (n_split != (uint16_t)splits.size()) {
+ throw std::runtime_error(format("invalid split count, given: %zu splits, but expected %d", splits.size(), n_split));
}
- // check idx
- {
- const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
- if (kid < 0) {
- throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ if (trace > 0) {
+ LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
+ }
+
+ // load other splits
+ for (idx = 1; idx < n_split; idx++) {
+ const char * fname_split = splits[idx].c_str();
+
+ struct gguf_init_params split_params = {
+ /*.no_alloc = */ true,
+ /*.ctx = */ &ctx,
+ };
+ gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
+ if (!ctx_gguf) {
+ throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
}
- int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
- if (idx_gguf != idx) {
- throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+
+ // check idx
+ {
+ const int kid = gguf_find_key(ctx_gguf.get(), kv_split_no.c_str());
+ if (kid < 0) {
+ throw std::runtime_error(format("missing key %s in GGUF split %s", kv_split_no.c_str(), fname_split));
+ }
+ int idx_gguf = gguf_get_val_u16(ctx_gguf.get(), kid);
+ if (idx_gguf != idx) {
+ throw std::runtime_error(format("invalid split file idx: %d (file: %s), expected %d", idx_gguf, fname_split, idx));
+ }
}
- }
- files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
- contexts.emplace_back(ctx);
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
+ contexts.emplace_back(ctx);
- // Save tensors data offset info of the shard.
- for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
- std::string tensor_name = std::string(cur->name);
- // make sure there is no duplicated tensor names
- if (weights_map.find(tensor_name) != weights_map.end()) {
- throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ // Save tensor data offset info of the shard.
+ for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+ std::string tensor_name = std::string(cur->name);
+ // make sure there are no duplicated tensor names
+ if (weights_map.find(tensor_name) != weights_map.end()) {
+ throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
+ }
+ n_elements += ggml_nelements(cur);
+ n_bytes += ggml_nbytes(cur);
+ weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
- n_elements += ggml_nelements(cur);
- n_bytes += ggml_nbytes(cur);
- weights_map.emplace(tensor_name, llama_tensor_weight(files.back().get(), idx, ctx_gguf.get(), cur));
}
- }
- get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
+ get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
- // sanity check
- {
- const int n_tensors_loaded = (int) weights_map.size();
- if (n_tensors != n_tensors_loaded) {
- throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ // sanity check
+ {
+ const int n_tensors_loaded = (int) weights_map.size();
+ if (n_tensors != n_tensors_loaded) {
+ throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
+ }
}
- }
- LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1);
+ }
+ } else {
+ get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+ llm_kv = LLM_KV(llm_arch_from_string(arch_name));
}
- n_kv = gguf_get_n_kv(meta.get());
+ n_kv = gguf_get_n_kv(metadata);
n_tensors = weights_map.size();
- fver = (enum llama_fver) gguf_get_version(meta.get());
+ fver = (enum llama_fver) gguf_get_version(metadata);
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
__func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
- const char * name = gguf_get_key(meta.get(), i);
- const enum gguf_type type = gguf_get_kv_type(meta.get(), i);
+ const char * name = gguf_get_key(metadata, i);
+ const enum gguf_type type = gguf_get_kv_type(metadata, i);
const std::string type_name =
type == GGUF_TYPE_ARRAY
- ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta.get(), i)), gguf_get_arr_n(meta.get(), i))
+ ? format("%s[%s,%zu]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(metadata, i)), gguf_get_arr_n(metadata, i))
: gguf_type_name(type);
- std::string value = gguf_kv_to_str(meta.get(), i);
+ std::string value = gguf_kv_to_str(metadata, i);
const size_t MAX_VALUE_LEN = 40;
if (value.size() > MAX_VALUE_LEN) {
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
return cur;
}
-struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
- LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
- const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+ GGML_ASSERT(w != nullptr);
+
+ if (op == GGML_OP_NONE) {
+ return true;
+ }
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead()*8,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context_ptr ctx_ptr { ggml_init(params) };
+ if (!ctx_ptr) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+ ggml_context * ctx = ctx_ptr.get();
+
+ ggml_tensor * op_tensor = nullptr;
+
+ switch (op) {
+ case GGML_OP_GET_ROWS:
+ {
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_get_rows(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul_mat(ctx, w, b);
+ } break;
+ case GGML_OP_MUL_MAT_ID:
+ {
+ const int n_expert_used = hparams.n_expert_used;
+ GGML_ASSERT(n_expert_used > 0);
+ ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+ } break;
+ case GGML_OP_ADD:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_add(ctx, a, w);
+ } break;
+ case GGML_OP_ADD_ID:
+ {
+ const int n_expert_used = hparams.n_expert_used;
+ GGML_ASSERT(n_expert_used > 0);
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+ ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+ op_tensor = ggml_add_id(ctx, a, w, c);
+ } break;
+ case GGML_OP_MUL:
+ {
+ ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+ op_tensor = ggml_mul(ctx, a, w);
+ } break;
+ case GGML_OP_DIV:
+ {
+ ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+ op_tensor = ggml_div(ctx, a, w);
+ } break;
+ case GGML_OP_ROPE:
+ {
+ const int n_embd_head = hparams.n_embd_head_v;
+ const int n_head = hparams.n_head();
+ ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+ ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+ op_tensor = ggml_rope_ext(
+ ctx, a, b, w,
+ 0, 0, 0, 0, 0,
+ 0, 0, 0, 0
+ );
+
+ } break;
+ case GGML_OP_SSM_CONV:
+ {
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
+ op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+ } break;
+ case GGML_OP_SSM_SCAN:
+ {
+ // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
+ const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
+ const int64_t n_head = w->ne[1];
+ const int64_t head_dim = hparams.ssm_d_inner / n_head;
+ const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
+ const int64_t n_seq_tokens = 512;
+ const int64_t n_seqs = 3;
+ ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
+ ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
+ ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
+ ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
+ op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
+ } break;
+ case GGML_OP_RWKV_WKV6:
+ {
+ // FIXME
+ const int64_t S = 123;
+ const int64_t H = 123;
+ const int64_t n_tokens = 123;
+ const int64_t n_seqs = 123;
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * tf = w;
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+ ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+ op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+ } break;
+ case GGML_OP_IM2COL:
+ {
+ const int n_embd_inp = hparams.n_embd_inp();
+ ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
+ op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+ } break;
+ case GGML_OP_SCALE:
+ {
+ op_tensor = ggml_scale(ctx, w, 1.0f);
+ } break;
+ default:
+ GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+ }
+
+ // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+ GGML_ASSERT(w->buffer == nullptr);
+ w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+ bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+ ggml_backend_buffer_free(w->buffer);
+ w->buffer = nullptr;
+
+ return op_supported;
+}
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t * buft_list) {
+ GGML_ASSERT(!buft_list->empty());
+ for (const auto & cur : *buft_list) {
+ ggml_backend_dev_t cur_dev = cur.first;
+ ggml_backend_buffer_type_t cur_buft = cur.second;
+ if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+ return cur_buft;
+ }
+ }
+
+ return nullptr;
+}
+
+struct ggml_tensor * llama_model_loader::create_tensor(
+ const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
+ const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) {
+ auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+ auto it = ctx_map.find(buft);
+ if (it == ctx_map.end()) {
+ // one ggml context per buffer type
+ int max_n_tensors = n_tensors;
+ max_n_tensors += 1; // duplicated output tensor
+ max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors
+ if (files.empty()) {
+ max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses
+ }
+ const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
+
+ ggml_init_params params = {
+ /*.mem_size =*/ ctx_size,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create ggml context"));
+ }
+
+ ctx_map.emplace(buft, ctx);
+
+ return ctx;
+ }
+ return it->second.get();
+ };
+
+ auto buft_for_tensor = [&](ggml_tensor * t_meta) -> ggml_backend_buffer_type_t {
+ if (!t_meta) {
+ if (flags & TENSOR_NOT_REQUIRED) {
+ return nullptr;
+ }
+ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
+ }
+
+ // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
+ // the tensor is duplicated
+ // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
+ llm_tensor tn_tensor = tn.tensor;
+ if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && (flags & TENSOR_DUPLICATED)) {
+ tn_tensor = LLM_TENSOR_OUTPUT;
+ }
+
+ llm_tensor_info info;
+ try {
+ info = llm_tensor_info_for(tn_tensor);
+ } catch (const std::out_of_range & e) {
+ throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
+ }
+
+ // skip unused tensors
+ if (info.op == GGML_OP_NONE || (flags & TENSOR_SKIP)) {
+ const size_t nbytes = ggml_nbytes(t_meta);
+ LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
+
+ size_data -= nbytes;
+ n_created++;
+
+ return nullptr;
+ }
+
+ // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
+ ggml_op op;
+ bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
+ if (bias) {
+ if (info.op == GGML_OP_MUL_MAT_ID) {
+ op = GGML_OP_ADD_ID;
+ } else {
+ op = GGML_OP_ADD;
+ }
+ } else {
+ op = info.op;
+ }
+
+ // sanity checks
+ if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
+ if (tn.bid != -1) {
+ GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
+ }
+ } else {
+ if (tn.bid == -1) {
+ GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
+ }
+ }
+
+ // select the buffer type for this tensor
+ const buft_list_t * buft_list;
+ switch (info.layer) {
+ case LLM_TENSOR_LAYER_INPUT:
+ buft_list = buft_list_input;
+ break;
+ case LLM_TENSOR_LAYER_OUTPUT:
+ buft_list = buft_list_output;
+ break;
+ case LLM_TENSOR_LAYER_REPEATING:
+ GGML_ASSERT(buft_list_layer != nullptr);
+ buft_list = buft_list_layer;
+ break;
+ default:
+ GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
+ }
+
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ // check overrides
+ if (tensor_buft_overrides) {
+ std::string tensor_name = tn.str();
+ for (const auto * overrides = tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
+ std::regex pattern(overrides->pattern);
+ if (std::regex_search(tensor_name, pattern)) {
+ if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+ // when overriding to a CPU buffer, consider the extra buffer types
+ buft = select_weight_buft(hparams, t_meta, op, buft_list_cpu);
+ } else {
+ buft = overrides->buft;
+ }
+
+ LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+ tensor_name.c_str(),
+ ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+ ggml_backend_buft_name(buft));
+ break;
+ }
+ }
+ }
+
+ if (!buft) {
+ buft = select_weight_buft(hparams, t_meta, op, buft_list);
+ if (!buft) {
+ throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
+ }
+ }
+
+ // avoid using a host buffer when using mmap
+ auto * buft_dev = ggml_backend_buft_get_device(buft);
+ if (use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ throw std::runtime_error("no CPU backend found");
+ }
+ buft = ggml_backend_dev_buffer_type(cpu_dev);
+ }
+
+ if (buft != buft_list->front().second) {
+ if (n_tensors_moved == 0) {
+ first_tensor_moved_name = t_meta->name;
+ first_tensor_moved_type_name = ggml_type_name(t_meta->type);
+ first_moved_from_buft = buft_list->front().second;
+ first_moved_to_buft = buft;
+ }
+ n_tensors_moved++;
+ }
+
+ return buft;
+ };
+
+ if (files.empty()) {
+ if (flags & TENSOR_SKIP_IF_VIRTUAL) {
+ return nullptr;
+ }
+ ggml_type type = GGML_TYPE_F32;
+ const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str());
+ if (tid != -1) {
+ type = gguf_get_tensor_type(metadata, tid);
+ }
+
+ // for tensors that are not required, some of the dimensions can be invalid:
+ if (flags & TENSOR_NOT_REQUIRED) {
+ for (size_t dim = 0; dim < ne.size(); dim++) {
+ if (ne.begin()[dim] <= 0) {
+ return nullptr;
+ }
+ }
+ }
+
+ ggml_tensor t_meta;
+ memset(&t_meta, 0, sizeof(ggml_tensor));
+ t_meta.type = type;
+ for (size_t dim = 0; dim < GGML_MAX_DIMS; dim++) {
+ t_meta.ne[dim] = dim < ne.size() ? ne.begin()[dim] : 1;
+ GGML_ASSERT(t_meta.ne[dim] >= 1);
+ t_meta.nb[dim] = dim == 0 ? ggml_type_size(type) : t_meta.ne[dim-1]*t_meta.nb[dim-1];
+ GGML_ASSERT(t_meta.nb[dim] >= 1);
+ }
+ ggml_set_name(&t_meta, tn.str().c_str());
+
+ ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta);
+ GGML_ASSERT(buft != nullptr);
+ ggml_context * ctx = ctx_for_buft(buft);
+ ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta);
+ ggml_set_name(ret, tn.str().c_str());
+ return ret;
+ }
+
+ ggml_tensor * t_meta = get_tensor_meta(tn.str().c_str());
+ ggml_backend_buffer_type_t buft = buft_for_tensor(t_meta);
+ if (buft == nullptr) {
+ return nullptr; // return type is ggml_tensor *
+ }
+ ggml_context * ctx = ctx_for_buft(buft);
+
+ // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
+ if (flags & TENSOR_DUPLICATED) {
+ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
+ if (t) {
+ return t;
+ }
+ }
+
+ LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, tn.str().c_str());
+ const struct ggml_tensor * cur = check_tensor_dims(tn.str(), ne, !(flags & TENSOR_NOT_REQUIRED));
if (cur == NULL) {
return NULL;
}
- bool duplicated = flags & TENSOR_DUPLICATED;
+ const bool duplicated = flags & TENSOR_DUPLICATED;
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
ggml_set_name(tensor, ggml_get_name(cur));
}
return tensor;
-
}
struct ggml_tensor * llama_model_loader::create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required) {
if (n_created != n_tensors) {
throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
}
+ if (n_tensors_moved > 0) {
+ LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %zu others) cannot be used with preferred buffer type %s, using %s instead\n",
+ __func__, first_tensor_moved_name.c_str(), first_tensor_moved_type_name.c_str(), n_tensors_moved - 1,
+ ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+ }
}
void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps) {
llama_mlocks * lmlocks,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {
+ if (files.empty()) {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+ set_tensor_data(t, set_tensor_data_ud);
+ }
+ return true;
+ }
GGML_ASSERT(size_data != 0 && "call init_mappings() first");
std::vector<no_init<uint8_t>> read_buf;
#include "llama-impl.h"
#include "llama-arch.h"
+#include "llama-hparams.h"
#include "llama-mmap.h"
#include "ggml-cpp.h"
#include <cstddef>
+#include <cstring>
#include <map>
#include <stdexcept>
#include <unordered_map>
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
enum llama_fver {
GGUF_FILE_VERSION_V1 = 1,
GGUF_FILE_VERSION_V2 = 2,
}
};
- static const int TENSOR_NOT_REQUIRED = 1 << 0;
- static const int TENSOR_DUPLICATED = 1 << 1;
- static const int TENSOR_SKIP = 1 << 2;
+ static const int TENSOR_NOT_REQUIRED = 1 << 0;
+ static const int TENSOR_DUPLICATED = 1 << 1;
+ static const int TENSOR_SKIP = 1 << 2;
+ static const int TENSOR_SKIP_IF_VIRTUAL = 1 << 3;
int n_kv = 0;
int n_tensors = 0;
std::unordered_map<std::string, llama_model_kv_override> kv_overrides;
const llama_model_tensor_buft_override * tensor_buft_overrides;
- gguf_context_ptr meta;
+ gguf_context_ptr metadata_ptr;
+ struct gguf_context * metadata; // either metadata_ptr.get() or externally set
+ llama_model_set_tensor_data_t set_tensor_data;
+ void * set_tensor_data_ud;
std::vector<ggml_context_ptr> contexts;
std::string arch_name;
size_t size_data = 0;
std::vector<std::pair<size_t, size_t>> mmaps_used;
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+ }
+ };
+
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+ // track tensors that had to be moved for debugging:
+ size_t n_tensors_moved = 0;
+ std::string first_tensor_moved_name;
+ std::string first_tensor_moved_type_name;
+ ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
+ ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
+
llama_model_loader(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const;
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags = 0);
+ struct ggml_tensor * create_tensor(
+ const llama_hparams & hparams, const buft_list_t * buft_list_cpu, const buft_list_t * buft_list_input, const buft_list_t * buft_list_output,
+ const buft_list_t * buft_list_layer, const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags);
struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::initializer_list<int64_t> & ne, size_t offset, bool required = true);
#include "llama-model.h"
#include "llama-vocab.h"
+#include <cstdint>
#include <string>
-llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
- gguf_ctx = gguf_init_empty();
-}
+llama_model_saver::llama_model_saver(const struct llama_model * model) :
+ gguf_ctx(gguf_init_empty()), gguf_ctx_owned(true), model(model), llm_kv(model->arch) {}
+
+llama_model_saver::llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx) :
+ gguf_ctx(gguf_ctx == nullptr ? gguf_init_empty() : gguf_ctx), gguf_ctx_owned(gguf_ctx == nullptr), model(nullptr), llm_kv(arch) {}
llama_model_saver::~llama_model_saver() {
- gguf_free(gguf_ctx);
+ if (gguf_ctx_owned) {
+ gguf_free(gguf_ctx);
+ }
}
void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
template <typename Container>
void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
- const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+ GGML_ASSERT(model != nullptr || !per_layer);
+ const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size();
GGML_ASSERT(n_values <= value.size());
if (n_values == 0) {
GGML_ABORT("fatal error");
}
}
+// instantiate for external usage:
+template void llama_model_saver::add_kv<std::vector<uint32_t>>(const enum llm_kv, const std::vector<uint32_t> &, const bool);
void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
std::vector<const char *> tmp(value.size());
}
void llama_model_saver::add_kv_from_model() {
- const llama_hparams & hparams = model.hparams;
- const llama_vocab & vocab = model.vocab;
+ const llama_hparams & hparams = model->hparams;
+ const llama_vocab & vocab = model->vocab;
const int32_t n_vocab = vocab.n_tokens();
std::vector<std::string> tokens(n_vocab);
std::vector<float> scores(n_vocab);
std::vector<int32_t> token_types(n_vocab);
- for (int32_t id = 0; id < n_vocab; ++id) {
- const llama_vocab::token_data & token_data = vocab.get_token_data(id);
-
- tokens[id] = token_data.text;
- scores[id] = token_data.score;
-
- switch(token_data.attr) {
- case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
- case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
- case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
- case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
- case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
- case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
- case LLAMA_TOKEN_ATTR_UNDEFINED:
- default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ if (vocab.get_type() != LLAMA_VOCAB_TYPE_NONE) {
+ for (int32_t id = 0; id < n_vocab; ++id) {
+ const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+ tokens[id] = token_data.text;
+ scores[id] = token_data.score;
+
+ switch(token_data.attr) {
+ case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
+ case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
+ case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
+ case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
+ case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+ case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
+ case LLAMA_TOKEN_ATTR_UNDEFINED:
+ default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+ }
}
}
// add_kv(LLM_KV_GENERAL_TYPE, ???);
- add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+ add_kv(LLM_KV_GENERAL_ARCHITECTURE, model->arch_name());
// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
- add_kv(LLM_KV_GENERAL_NAME, model.name);
+ add_kv(LLM_KV_GENERAL_NAME, model->name);
// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
// add_kv(LLM_KV_GENERAL_VERSION, ???);
// add_kv(LLM_KV_GENERAL_URL, ???);
}
void llama_model_saver::add_tensors_from_model() {
- if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
- add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+ if (std::string(model->output->name) != std::string(model->tok_embd->name)) {
+ add_tensor(model->tok_embd); // some models use the same tensor for tok_embd and output
}
- add_tensor(model.type_embd);
- add_tensor(model.pos_embd);
- add_tensor(model.tok_norm);
- add_tensor(model.tok_norm_b);
- add_tensor(model.output_norm);
- add_tensor(model.output_norm_b);
- add_tensor(model.output);
- add_tensor(model.output_b);
- add_tensor(model.output_norm_enc);
- add_tensor(model.cls);
- add_tensor(model.cls_b);
- add_tensor(model.cls_out);
- add_tensor(model.cls_out_b);
- add_tensor(model.cls_norm);
-
- for (const struct llama_layer & layer : model.layers) {
+ add_tensor(model->type_embd);
+ add_tensor(model->pos_embd);
+ add_tensor(model->tok_norm);
+ add_tensor(model->tok_norm_b);
+ add_tensor(model->output_norm);
+ add_tensor(model->output_norm_b);
+ add_tensor(model->output);
+ add_tensor(model->output_b);
+ add_tensor(model->output_norm_enc);
+ add_tensor(model->cls);
+ add_tensor(model->cls_b);
+ add_tensor(model->cls_out);
+ add_tensor(model->cls_out_b);
+ add_tensor(model->cls_norm);
+
+ for (const struct llama_layer & layer : model->layers) {
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
}
#pragma once
+#include "gguf.h"
#include "llama.h"
#include "llama-arch.h"
struct llama_model_saver {
struct gguf_context * gguf_ctx = nullptr;
- const struct llama_model & model;
+ const bool gguf_ctx_owned;
+ const struct llama_model * model;
const struct LLM_KV llm_kv;
- llama_model_saver(const struct llama_model & model);
+ llama_model_saver(const struct llama_model * model);
+ llama_model_saver(enum llm_arch arch, struct gguf_context * gguf_ctx);
~llama_model_saver();
void add_kv(enum llm_kv key, uint32_t value);
#include "llama-model.h"
+#include "ggml.h"
#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-cparams.h"
#include <algorithm>
#include <cassert>
#include <cfloat>
+#include <cstdint>
#include <cstring>
#include <cmath>
#include <functional>
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}
-// checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
- GGML_ASSERT(w != nullptr);
-
- if (op == GGML_OP_NONE) {
- return true;
- }
-
- ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead()*8,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- ggml_context_ptr ctx_ptr { ggml_init(params) };
- if (!ctx_ptr) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
- ggml_context * ctx = ctx_ptr.get();
-
- ggml_tensor * op_tensor = nullptr;
-
- switch (op) {
- case GGML_OP_GET_ROWS:
- {
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_get_rows(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT:
- {
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
- op_tensor = ggml_mul_mat(ctx, w, b);
- } break;
- case GGML_OP_MUL_MAT_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
- } break;
- case GGML_OP_ADD:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_add(ctx, a, w);
- } break;
- case GGML_OP_ADD_ID:
- {
- int n_expert_used = hparams.n_expert_used;
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
- ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
- op_tensor = ggml_add_id(ctx, a, w, c);
- } break;
- case GGML_OP_MUL:
- {
- ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
- op_tensor = ggml_mul(ctx, a, w);
- } break;
- case GGML_OP_DIV:
- {
- ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
- op_tensor = ggml_div(ctx, a, w);
- } break;
- case GGML_OP_ROPE:
- {
- int n_embd_head = hparams.n_embd_head_v;
- int n_head = hparams.n_head();
- ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
- ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
- op_tensor = ggml_rope_ext(
- ctx, a, b, w,
- 0, 0, 0, 0, 0,
- 0, 0, 0, 0
- );
-
- } break;
- case GGML_OP_SSM_CONV:
- {
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 3;
- ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
- op_tensor = ggml_ssm_conv(ctx, conv_x, w);
- } break;
- case GGML_OP_SSM_SCAN:
- {
- // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
- const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
- const int64_t n_head = w->ne[1];
- const int64_t head_dim = hparams.ssm_d_inner / n_head;
- const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
- const int64_t n_seq_tokens = 512;
- const int64_t n_seqs = 3;
- ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
- ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
- ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
- ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
- ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
- ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
- op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
- } break;
- case GGML_OP_RWKV_WKV6:
- {
- // FIXME
- const int64_t S = 123;
- const int64_t H = 123;
- const int64_t n_tokens = 123;
- const int64_t n_seqs = 123;
- ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * tf = w;
- ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
- ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
- } break;
- case GGML_OP_IM2COL:
- {
- const int n_embd_inp = hparams.n_embd_inp();
- ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
- op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
- } break;
- case GGML_OP_SCALE:
- {
- op_tensor = ggml_scale(ctx, w, 1.0f);
- } break;
- default:
- GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
- }
-
- // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
- GGML_ASSERT(w->buffer == nullptr);
- w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
- bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
- ggml_backend_buffer_free(w->buffer);
- w->buffer = nullptr;
-
- return op_supported;
-}
-
-// lists of buffer types used for each layer
-using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-
-// find the first buffer type in the list that can use the tensor
-static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
- GGML_ASSERT(!buft_list.empty());
- for (const auto & cur : buft_list) {
- ggml_backend_dev_t cur_dev = cur.first;
- ggml_backend_buffer_type_t cur_buft = cur.second;
- if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
- return cur_buft;
- }
- }
-
- return nullptr;
-}
-
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
buft_list_t buft_list;
}
void llama_model::load_hparams(llama_model_loader & ml) {
- const gguf_context * ctx = ml.meta.get();
+ const gguf_context * ctx = ml.metadata;
// get metadata as string
for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
hparams.n_attn_temp_floor_scale = 8192;
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
- hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
+ uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_AFMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
// Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
{
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 3;
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 3;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period, true);
} else {
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
hparams.f_max_alibi_bias = 8.0f;
case LLM_ARCH_JINA_BERT_V3:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case LLM_ARCH_NOMIC_BERT_MOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
case LLM_ARCH_NEO_BERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 28) {
type = LLM_TYPE_250M;
case LLM_ARCH_EUROBERT:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
if (hparams.n_layer == 12) {
type = LLM_TYPE_SMALL; // 0.2B
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_7B; break;
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 8;
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
} else {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096; // default value of gemma 2
- hparams.set_swa_pattern(2);
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(6);
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
} break;
case LLM_ARCH_GEMMA3N:
{
+ uint32_t swa_period = 5;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(5);
+ hparams.set_swa_pattern(swa_period);
hparams.n_layer_kv_from_start = 20;
hparams.f_attention_scale = 1.0f;
case LLM_ARCH_GEMMA_EMBEDDING:
{
hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
- hparams.set_swa_pattern(6);
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.causal_attn = false; // embeddings do not use causal attention
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
//applied only if model converted with --sentence-transformers-dense-modules
ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
} break;
case LLM_ARCH_COMMAND_R:
{
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
switch (hparams.n_layer) {
case 40: type = LLM_TYPE_35B; break;
case LLM_ARCH_COHERE2:
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
case LLM_ARCH_DEEPSEEK:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
switch (hparams.n_ff_exp) {
case 1408: type = LLM_TYPE_16B; break;
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
if (!is_lite) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
}
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// Expert gating function (GLM-4.5 uses sigmoid)
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// deepseek MLA parameters
case LLM_ARCH_JAIS:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_1_3B; break;
if (hparams.n_layer == 64) { // 32B
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 128;
- hparams.set_swa_pattern(4);
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);
// Granite uses rope_finetuned as a switch for rope, so default to true
bool rope_finetuned = true;
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
- ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_7B; break;
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
} break;
case LLM_ARCH_BAILINGMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
switch (hparams.n_layer) {
case LLM_ARCH_BAILINGMOE2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
case LLM_ARCH_DOTS1:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
switch (hparams.n_layer) {
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
}
switch (hparams.n_layer) {
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
switch (hparams.n_layer) {
case 32: type = LLM_TYPE_A13B; break;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(2);
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
{
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
if (found_swa && hparams.n_swa > 0) {
- hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.n_swa = 4096;
- hparams.set_swa_pattern(4, true);
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period, true);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
case LLM_ARCH_GROVEMOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp, false);
ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
switch (hparams.n_layer) {
// MoE parameters - Kimi uses moe_intermediate_size = 1024
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
switch (hparams.n_layer) {
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
default: type = LLM_TYPE_UNKNOWN;
}
} break;
- default: throw std::runtime_error("unsupported model architecture");
+ default: throw std::runtime_error("unsupported model architecture: " + arch_name());
}
pimpl->n_bytes = ml.n_bytes;
// assign the output layer
pimpl->dev_output = get_layer_buft_list(n_layer);
- // one ggml context per buffer type
- int max_n_tensors = ml.n_tensors;
- max_n_tensors += 1; // duplicated output tensor
- max_n_tensors += n_layer*2; // duplicated rope freq tensors
- const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
-
- // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
- struct ggml_backend_buft_comparator {
- bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
- return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
- }
- };
- std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
-
- auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
- auto it = ctx_map.find(buft);
- if (it == ctx_map.end()) {
- ggml_init_params params = {
- /*.mem_size =*/ ctx_size,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- ggml_context * ctx = ggml_init(params);
- if (!ctx) {
- throw std::runtime_error(format("failed to create ggml context"));
- }
-
- ctx_map.emplace(buft, ctx);
-
- return ctx;
- }
- return it->second.get();
- };
-
- const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
- const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
- const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+ const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
+ const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
+ const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
+ const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
// create tensors for the weights
{
throw std::runtime_error("model has expert layers but no expert layers are used");
}
- int n_moved_tensors = 0;
- ggml_tensor * first_moved_tensor = nullptr;
- ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
- ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
-
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
- ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
-
- if (!t_meta) {
- if (flags & TENSOR_NOT_REQUIRED) {
- return nullptr;
- }
- throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
- }
-
- // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
- // the tensor is duplicated
- // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
- llm_tensor tn_tensor = tn.tensor;
- if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
- tn_tensor = LLM_TENSOR_OUTPUT;
- }
-
- llm_tensor_info info;
- try {
- info = llm_tensor_info_for(tn_tensor);
- } catch (const std::out_of_range & e) {
- throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
- }
-
- // skip unused tensors
- if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
- const size_t nbytes = ggml_nbytes(t_meta);
- LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
-
- ml.size_data -= nbytes;
- ml.n_created++;
-
- return nullptr;
- }
-
- // tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
- ggml_op op;
- bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
- if (bias) {
- if (info.op == GGML_OP_MUL_MAT_ID) {
- op = GGML_OP_ADD_ID;
- } else {
- op = GGML_OP_ADD;
- }
- } else {
- op = info.op;
- }
-
- // sanity checks
- if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
- if (tn.bid != -1) {
- GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
- }
- } else {
- if (tn.bid == -1) {
- GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
- }
- }
-
- // select the buffer type for this tensor
- buft_list_t * buft_list;
- switch (info.layer) {
- case LLM_TENSOR_LAYER_INPUT:
- buft_list = pimpl->dev_input.buft_list;
- break;
- case LLM_TENSOR_LAYER_OUTPUT:
- buft_list = pimpl->dev_output.buft_list;
- break;
- case LLM_TENSOR_LAYER_REPEATING:
- buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
- break;
- default:
- GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
- }
-
- ggml_backend_buffer_type_t buft = nullptr;
-
- // check overrides
- if (ml.tensor_buft_overrides) {
- std::string tensor_name = tn.str();
- for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
- std::regex pattern(overrides->pattern);
- if (std::regex_search(tensor_name, pattern)) {
- if (overrides->buft == ggml_backend_cpu_buffer_type()) {
- // when overriding to a CPU buffer, consider the extra buffer types
- buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
- } else {
- buft = overrides->buft;
- }
-
- LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
- tensor_name.c_str(),
- ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
- ggml_backend_buft_name(buft));
- break;
- }
- }
- }
-
- if (!buft) {
- buft = select_weight_buft(hparams, t_meta, op, *buft_list);
- if (!buft) {
- throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
- }
- }
-
- // avoid using a host buffer when using mmap
- auto * buft_dev = ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
- auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
- if (!cpu_dev) {
- throw std::runtime_error("no CPU backend found");
- }
- buft = ggml_backend_dev_buffer_type(cpu_dev);
- }
-
- if (buft != buft_list->front().second) {
- n_moved_tensors++;
- if (!first_moved_tensor) {
- first_moved_tensor = t_meta;
- first_moved_from_buft = buft_list->front().second;
- first_moved_to_buft = buft;
- }
- }
-
- ggml_context * ctx = ctx_for_buft(buft);
-
- // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
- if (flags & TENSOR_DUPLICATED) {
- ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
- if (t) {
- return t;
- }
- }
- return ml.create_tensor(ctx, tn, ne, flags);
+ const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
+ return ml.create_tensor(
+ hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
+ tn, ne, flags);
};
layers.resize(n_layer);
} break;
case LLM_ARCH_LLAMA4:
{
+ if (n_expert == 0) {
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
+ }
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
}
for (int i = 0; i < n_layer; ++i) {
- bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
+ const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
auto & layer = layers[i];
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
if (is_moe_layer) {
- int n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_ff_exp = hparams.n_ff_exp;
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
case LLM_ARCH_GROK:
{
if (n_expert == 0) {
- throw std::runtime_error("Grok model cannot have zero experts");
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
}
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_JINA_BERT_V3:
{
+ if (n_token_types == 0) {
+ throw std::runtime_error(arch_name() + " model needs to define token type count");
+ }
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
- layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+ // FIXME test-llama-archs crashes if q_norm is created
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+ layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+ GGML_ASSERT(n_embd_head_qk_nope >= 1);
const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv;
layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
// this tensor seems to be unused in HF transformers implementation
- layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
+ layer.attn_rel_b_cross = create_tensor(
+ tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
const int64_t n_ff_exp = hparams.n_ff_exp;
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
const int64_t head_dim = hparams.n_embd_head_k;
const int64_t n_qo_dim = n_head * head_dim;
const int64_t n_kv_dim = n_head_kv * head_dim;
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
}
} break;
case LLM_ARCH_HUNYUAN_DENSE:
const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
const int64_t ssm_d_conv = hparams.ssm_d_conv;
- // Try loading KDA specific tensors (using SSM_ prefix)
- // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
- // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
- layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
- if (!layer.ssm_q_conv) {
- layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED);
- }
+ if (hparams.is_recurrent(i)) {
+ // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
+ // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+ if (!layer.ssm_q_conv) {
+ layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+ }
- if (layer.ssm_q_conv) {
// KDA Layer - Conv1d weights may be 3D or 4D
layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
if (!layer.ssm_k_conv) {
const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
// Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
- layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED);
+ layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
+ {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
if (!layer.wkv_b) { // MLA KV cache enabled
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
} break;
case LLM_ARCH_QWEN3NEXT:
{
+ if (n_expert == 0) {
+ throw std::runtime_error(arch_name() + " model cannot have zero experts");
+ }
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
+ const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
// Shared experts
layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
- layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
- layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
- layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
}
} break;
case LLM_ARCH_QWEN35MOE:
default:
throw std::runtime_error("unknown architecture");
}
-
- if (n_moved_tensors > 0) {
- LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
- __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
- ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
- }
}
ml.done_getting_tensors();
// create the backend buffers
std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
- ctx_buf_maps.reserve(ctx_map.size());
+ ctx_buf_maps.reserve(ml.ctx_map.size());
// Ensure we have enough capacity for the maximum backend buffer we will potentially create
- const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+ const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
- for (auto & [buft, ctx_ptr] : ctx_map) {
+ for (auto & [buft, ctx_ptr] : ml.ctx_map) {
ggml_context * ctx = ctx_ptr.get();
// skip contexts without tensors
}
std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
+ llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
+ fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
ml.init_mappings(false); // no prefetching
llama_model model(llama_model_default_params());
}
// copy the KV pairs from the input file
- gguf_set_kv (ctx_out.get(), ml.meta.get());
+ gguf_set_kv (ctx_out.get(), ml.metadata);
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV
};
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
- struct gguf_context * ctx = ml.meta.get();
+ struct gguf_context * ctx = ml.metadata;
// determine vocab type
{
#include "llama.h"
+#include "ggml-cpp.h"
#include "llama-impl.h"
#include "llama-chat.h"
#include "ggml.h"
#include "ggml-backend.h"
+#include "gguf.h"
#include <algorithm>
#include <cassert>
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+static int llama_model_load(struct gguf_context * metadata, llama_model_set_tensor_data_t set_tensor_data, void * set_tensor_data_ud,
+ const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;
model.t_start_us = tm.t_start_us;
try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
+ llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
+ params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
ml.print_info();
}
static struct llama_model * llama_model_load_from_file_impl(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
const std::string & path_model,
std::vector<std::string> & splits,
struct llama_model_params params) {
+ GGML_ASSERT((metadata == nullptr) != path_model.empty() && "exactly one out of metadata and path_model needs to be defined");
ggml_time_init();
if (!params.vocab_only && ggml_backend_reg_count() == 0) {
props.memory_free/1024/1024);
}
- const int status = llama_model_load(path_model, splits, *model, params);
+ const int status = llama_model_load(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
return model;
}
+struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data,
+ void * set_tensor_data_ud,
+ struct llama_model_params params) {
+ GGML_ASSERT(metadata != nullptr);
+ std::string path_model;
+ std::vector<std::string> splits = {};
+ params.use_mmap = false;
+ params.use_extra_bufts = false;
+ return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
+}
// deprecated
struct llama_model * llama_load_model_from_file(
const char * path_model,
const char * path_model,
struct llama_model_params params) {
std::vector<std::string> splits = {};
- return llama_model_load_from_file_impl(path_model, splits, params);
+ return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, path_model, splits, params);
}
struct llama_model * llama_model_load_from_splits(
for (size_t i = 0; i < n_paths; ++i) {
splits.push_back(paths[i]);
}
- return llama_model_load_from_file_impl(splits.front(), splits, params);
+ return llama_model_load_from_file_impl(nullptr, nullptr, nullptr, splits.front(), splits, params);
}
void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
- llama_model_saver ms(*model);
+ llama_model_saver ms(model);
ms.add_kv_from_model();
ms.add_tensors_from_model();
ms.save(path_model);
);
break;
case LLM_TYPE_13B:
+ case LLM_TYPE_UNKNOWN:
break;
default:
GGML_ABORT("fatal error");
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);
ggml_build_forward_expand(gf, cur);
- // Check layer type by checking which tensors exist
- // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
- bool is_kda = (layer.ssm_a != nullptr);
- bool is_mla = (layer.wkv_a_mqa != nullptr);
-
- if (is_kda) {
+ if (hparams.is_recurrent(il)) {
// === KDA Layer (Kimi Delta Attention) with Recurrent State ===
// Reference: vLLM kda.py
const auto * mctx_cur = inp_rs->mctx;
cur = ggml_mul_mat(ctx0, layer.wo, gated);
cb(cur, "kda_out", il);
- } else if (is_mla) {
+ } else {
// === MLA Layer (Multi-head Latent Attention) without KV Cache ===
// Reference: vLLM mla.py
// Step 1: Q projection and reshape
cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
cb(cur, "mla_out", il);
}
- } else {
- // Unknown layer type - this should not happen
- GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
}
// On last layer, select only the output tokens
hparams.n_expert,
hparams.n_expert_used,
LLM_FFN_SILU, true,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
const auto kv_head = mctx_cur->get_head();
+ const int64_t n_embd = hparams.n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
+ GGML_ASSERT(d_inner % (n_group*n_embd) == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
- true, hparams.expert_weights_scale,
+ hparams.expert_weights_scale, hparams.expert_weights_scale,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
il);
cb(moe_out, "ffn_moe_out", il);
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// check if this layer is Mamba or Attention
- bool is_mamba_layer = hparams.is_recurrent(il);
+ const bool is_mamba_layer = hparams.is_recurrent(il);
if (is_mamba_layer) {
// PLaMo-2 Mamba layer
GGML_ASSERT(n_seqs != 0);
GGML_ASSERT(ubatch.equal_seqs());
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+ GGML_ASSERT(d_inner % n_head == 0);
+ GGML_ASSERT(n_group == 0);
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${PROJECT_SOURCE_DIR}/models/ggml-vocab-baichuan.gguf)
# llama_build_and_test(test-double-float.cpp) # SLOW
+
+ llama_build_and_test(test-llama-archs.cpp)
endif()
llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
--- /dev/null
+#include "common.h"
+#include "log.h"
+#include "ggml-backend.h"
+#include "ggml.h"
+#include "gguf.h"
+#include "ggml-cpp.h"
+#include "llama.h"
+#include "llama-cpp.h"
+#include "../src/llama-arch.h"
+#include "../src/llama-model-saver.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include <cstring>
+#include <cstdint>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+// normalized mean squared error = mse(a, b) / mse(a, 0)
+static double nmse(const std::vector<float> & a, const std::vector<float> & b) {
+ GGML_ASSERT(a.size() == b.size());
+ double mse_a_b = 0.0;
+ double mse_a_0 = 0.0;
+
+ for (size_t i = 0; i < a.size(); i++) {
+ float a_i = a[i];
+ float b_i = b[i];
+
+ mse_a_b += (a_i - b_i) * (a_i - b_i);
+ mse_a_0 += a_i * a_i;
+ }
+
+ return mse_a_b / mse_a_0;
+}
+
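+// fill a tensor with small, normally distributed values; the RNG is seeded from the tensor name plus a user-provided seed so that runs are reproducible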
+static void set_tensor_data(struct ggml_tensor * tensor, void * userdata) {
+ std::hash<std::string> hasher;
+ std::mt19937 gen(hasher(tensor->name) + *(const size_t *) userdata);
+ std::normal_distribution<float> dis(0.0f, 1.0e-2f);
+
+ const int64_t ne = ggml_nelements(tensor);
+ if (tensor->type == GGML_TYPE_F32) {
+ std::vector<float> tmp(ne);
+ for (int64_t i = 0; i < ne; i++) {
+ tmp[i] = dis(gen);
+ }
+ ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
+ } else if (tensor->type == GGML_TYPE_F16) {
+ std::vector<ggml_fp16_t> tmp(ne);
+ for (int64_t i = 0; i < ne; i++) {
+ tmp[i] = ggml_fp32_to_fp16(dis(gen));
+ }
+ ggml_backend_tensor_set(tensor, tmp.data(), 0, ggml_nbytes(tensor));
+ } else {
+ GGML_ABORT("fatal error");
+ }
+}
+
+static void usage(char ** argv) {
+    printf("Usage: %s [-a/--arch arch] [-s/--seed seed] [-o/--out dir] [-v/--verbose]\n", argv[0]);
+}
+
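+// generate n_tokens uniformly random token ids in [0, n_vocab)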
+static std::vector<llama_token> get_tokens(const uint32_t n_tokens, const uint32_t n_vocab, const size_t seed){
+ std::mt19937 gen(seed);
+ std::uniform_int_distribution<> dis(0, n_vocab - 1);
+ std::vector<llama_token> ret;
+ ret.reserve(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ ret.push_back(dis(gen));
+ }
+ return ret;
+}
+
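+// build minimal GGUF metadata for the given architecture with tiny hyperparameters;
+// keys that a given architecture does not read are simply ignored when the model is loaded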
+static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
+ gguf_context_ptr ret(gguf_init_empty());
+ llama_model_saver ms(arch, ret.get());
+ const uint32_t n_ctx = 128;
+
+ uint32_t n_vocab = 128;
+ uint32_t n_embd = 256;
+ uint32_t n_head = 2;
+ uint32_t n_ff = 384;
+ uint32_t n_layer = 2;
+ if (arch == LLM_ARCH_LLAMA4) {
+ n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
+    } else if (arch == LLM_ARCH_GEMMA3N) {
+        n_embd  = 64;
+        n_head  = 1;
+        n_ff    = 96;
+        n_layer = 22; // hparams.n_layer_kv_from_start = 20 is hardcoded
+ } else if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
+ n_embd = 128;
+ n_head = 1;
+ n_ff = 192;
+ } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ n_layer = 3;
+ } else if (arch == LLM_ARCH_CHAMELEON) {
+ n_vocab = 10240;
+ }
+
+ const uint32_t n_embd_head = n_embd / n_head;
+
+ ms.add_kv(LLM_KV_GENERAL_ARCHITECTURE, llm_arch_name(arch));
+ ms.add_kv(LLM_KV_VOCAB_SIZE, n_vocab);
+ ms.add_kv(LLM_KV_CONTEXT_LENGTH, n_ctx);
+ ms.add_kv(LLM_KV_EMBEDDING_LENGTH, n_embd);
+ ms.add_kv(LLM_KV_FEATURES_LENGTH, n_embd);
+ ms.add_kv(LLM_KV_BLOCK_COUNT, n_layer);
+ ms.add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, uint32_t(1));
+
+ if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
+ std::vector<uint32_t> n_ff_per_layer;
+ n_ff_per_layer.reserve(n_layer);
+ for (uint32_t il = 0; il < n_layer; il++) {
+ n_ff_per_layer.push_back(il <= 1 ? 0 : n_ff);
+ }
+ ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff_per_layer);
+ } else {
+ ms.add_kv(LLM_KV_FEED_FORWARD_LENGTH, n_ff);
+ }
+
+ ms.add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, false);
+ ms.add_kv(LLM_KV_LOGIT_SCALE, 1.0f);
+ ms.add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, uint32_t(64));
+ ms.add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, uint32_t(128));
+ ms.add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, uint32_t(2));
+
+ if (arch == LLM_ARCH_PLAMO2 || arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE ||
+ arch == LLM_ARCH_GRANITE_HYBRID || arch == LLM_ARCH_LFM2 || arch == LLM_ARCH_LFM2MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+ GGML_ASSERT(n_layer >= 2);
+ std::vector<uint32_t> n_head_per_layer;
+ n_head_per_layer.reserve(n_layer);
+ for (uint32_t il = 0; il < n_layer; il++) {
+ n_head_per_layer.push_back(il == 1 ? 0 : n_head);
+ }
+ ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head_per_layer);
+ ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head_per_layer);
+ } else {
+ ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT, n_head);
+ ms.add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, n_head);
+ }
+
+ ms.add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, 8.0f);
+ if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_KIMI_LINEAR) {
+ ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH, uint32_t(576));
+ ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, uint32_t(512));
+ ms.add_kv(LLM_KV_ROPE_DIMENSION_COUNT, uint32_t(64));
+ ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_MLA, uint32_t(192));
+ ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, uint32_t(128));
+ }
+ ms.add_kv(LLM_KV_ATTENTION_CLAMP_KQV, 1.0f);
+ ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, 1e-5f);
+ ms.add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+ ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_EPS, 1e-5f);
+ ms.add_kv(LLM_KV_ATTENTION_GROUPNORM_GROUPS, uint32_t(8));
+ ms.add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, uint32_t(512));
+ ms.add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, uint32_t(512));
+ ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
+ ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx/8);
+
+ if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
+ std::vector<uint32_t> pattern;
+ pattern.reserve(n_layer);
+ for (uint32_t il = 0; il < n_layer; il++) {
+ pattern.push_back(il % 2);
+ }
+ ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, pattern);
+ } else {
+ ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(2));
+ }
+
+ ms.add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, uint32_t(1));
+ ms.add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, uint32_t(64));
+ ms.add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, uint32_t(8));
+ ms.add_kv(LLM_KV_ROPE_DIMENSION_SECTIONS, std::vector<uint32_t>({n_embd_head/4, n_embd_head/4, n_embd_head/4, n_embd_head/4}));
+ ms.add_kv(LLM_KV_TOKENIZER_MODEL, "no_vocab");
+ // ms.add_kv(LLM_KV_DENSE_2_FEAT_OUT, n_embd);
+ // ms.add_kv(LLM_KV_DENSE_3_FEAT_IN, n_embd);
+
+ if (moe) {
+ ms.add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, n_ff);
+ ms.add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, uint32_t(2));
+ ms.add_kv(LLM_KV_EXPERT_COUNT, uint32_t(2));
+ ms.add_kv(LLM_KV_EXPERT_USED_COUNT, uint32_t(1));
+ ms.add_kv(LLM_KV_EXPERT_SHARED_COUNT, uint32_t(1));
+ ms.add_kv(LLM_KV_EXPERT_GATING_FUNC, uint32_t(2)); // sigmoid
+ ms.add_kv(LLM_KV_EXPERT_GROUP_SCALE, 1.0f);
+ ms.add_kv(LLM_KV_EXPERTS_PER_GROUP, uint32_t(1));
+ }
+
+ ms.add_kv(LLM_KV_POSNET_EMBEDDING_LENGTH, n_embd);
+ ms.add_kv(LLM_KV_POSNET_BLOCK_COUNT, n_layer);
+ ms.add_kv(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, n_embd);
+ ms.add_kv(LLM_KV_CONVNEXT_BLOCK_COUNT, n_layer);
+ ms.add_kv(LLM_KV_XIELU_ALPHA_N, 1.0f);
+ ms.add_kv(LLM_KV_XIELU_ALPHA_P, 1.0f);
+ ms.add_kv(LLM_KV_XIELU_BETA, 1.0f);
+ ms.add_kv(LLM_KV_XIELU_EPS, 1.0e-7f);
+ ms.add_kv(LLM_KV_SSM_INNER_SIZE, arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE ? 64 : 2*n_embd);
+ ms.add_kv(LLM_KV_SSM_CONV_KERNEL, uint32_t(4));
+ ms.add_kv(LLM_KV_SSM_STATE_SIZE, uint32_t(32));
+ ms.add_kv(LLM_KV_SSM_TIME_STEP_RANK, n_head);
+ ms.add_kv(LLM_KV_SSM_GROUP_COUNT, arch == LLM_ARCH_PLAMO2 ? 0 : uint32_t(2));
+ ms.add_kv(LLM_KV_KDA_HEAD_DIM, uint32_t(128));
+ ms.add_kv(LLM_KV_WKV_HEAD_SIZE, n_embd/n_head);
+ ms.add_kv(LLM_KV_SHORTCONV_L_CACHE, uint32_t(3));
+
+ for (uint32_t il = 0; il < n_layer; il++) {
+ ggml_tensor t;
+ memset(&t, 0, sizeof(ggml_tensor));
+ t.type = GGML_TYPE_F16;
+ ggml_format_name(&t, "conv%" PRIu32 "d.weight", il);
+ gguf_add_tensor(ms.gguf_ctx, &t);
+ ggml_format_name(&t, "posnet.%" PRIu32 ".conv1.weight", il);
+ gguf_add_tensor(ms.gguf_ctx, &t);
+ ggml_format_name(&t, "posnet.%" PRIu32 ".conv2.weight", il);
+ gguf_add_tensor(ms.gguf_ctx, &t);
+ ggml_format_name(&t, "convnext.%" PRIu32 ".dw.weight", il);
+ gguf_add_tensor(ms.gguf_ctx, &t);
+ }
+ return ret;
+}
+
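+// create a model from the synthetic metadata (tensor data is filled via set_tensor_data) together with a context on the given devices;
+// an empty device list selects the CPU backend only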
+static std::pair<llama_model_ptr, llama_context_ptr> get_model_and_ctx(
+ struct gguf_context * gguf_ctx, const size_t seed, const std::vector<ggml_backend_dev_t> & devs) {
+ llama_model_params model_params = llama_model_default_params();
+ std::vector<ggml_backend_dev_t> devs_copy = devs;
+ devs_copy.push_back(nullptr);
+ model_params.devices = devs_copy.data();
+
+ llama_context_params ctx_params = llama_context_default_params();
+ ctx_params.n_ctx = 0;
+ ctx_params.n_threads = 4;
+ ctx_params.n_threads_batch = 4;
+
+ size_t tmp = seed;
+ llama_model_ptr model(llama_model_init_from_user(gguf_ctx, set_tensor_data, &tmp, model_params));
+ if (!model) {
+ throw std::runtime_error("failed to create llama model");
+ }
+ llama_context_ptr lctx(llama_init_from_model(model.get(), ctx_params));
+ if (!lctx) {
+ throw std::runtime_error("failed to create llama context");
+ }
+ return std::make_pair(std::move(model), std::move(lctx));
+}
+
+static std::vector<float> get_logits(
+ llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens, bool encode = false) {
+ const uint32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+ const uint32_t n_ctx = llama_n_ctx(lctx);
+ const uint32_t n_tokens = tokens.size();
+ llama_batch batch = llama_batch_init(n_ctx, 0, 1);
+ GGML_ASSERT(n_tokens <= n_ctx);
+ for (uint32_t pos = 0; pos < n_tokens; pos++) {
+ common_batch_add(batch, tokens[pos], pos, {0}, true);
+ }
+ batch.n_tokens = n_tokens;
+ if (encode) {
+ if (llama_encode(lctx, batch)) {
+ llama_batch_free(batch);
+ throw std::runtime_error("failed to encode batch");
+ }
+ }
+ if (llama_decode(lctx, batch)) {
+ llama_batch_free(batch);
+ throw std::runtime_error("failed to decode batch");
+ }
+
+ std::vector<float> ret;
+ ret.reserve(n_tokens*n_vocab);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ const float * logits_ith = llama_get_logits_ith(lctx, i);
+ for (uint32_t j = 0; j < n_vocab; j++) {
+ ret.push_back(logits_ith[j]);
+ }
+ }
+ llama_batch_free(batch);
+ return ret;
+}
+
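+// architectures that only have a MoE code path and therefore cannot be instantiated as dense models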
+static bool moe_mandatory(const llm_arch arch) {
+ switch (arch) {
+ case LLM_ARCH_LLAMA4:
+ case LLM_ARCH_GROK:
+ case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_QWEN3MOE:
+ case LLM_ARCH_QWEN3NEXT:
+ case LLM_ARCH_QWEN3VLMOE:
+ case LLM_ARCH_QWEN35MOE:
+ case LLM_ARCH_PHIMOE:
+ case LLM_ARCH_DBRX:
+ case LLM_ARCH_OLMOE:
+ case LLM_ARCH_ARCTIC:
+ case LLM_ARCH_DEEPSEEK:
+ case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_GLM4_MOE:
+ case LLM_ARCH_GLM_DSA:
+ case LLM_ARCH_EXAONE_MOE:
+ case LLM_ARCH_BAILINGMOE:
+ case LLM_ARCH_BAILINGMOE2:
+ case LLM_ARCH_DOTS1:
+ case LLM_ARCH_AFMOE:
+ case LLM_ARCH_ERNIE4_5:
+ case LLM_ARCH_ERNIE4_5_MOE:
+ case LLM_ARCH_HUNYUAN_MOE:
+ case LLM_ARCH_OPENAI_MOE:
+ case LLM_ARCH_LFM2MOE:
+ case LLM_ARCH_SMALLTHINKER:
+ case LLM_ARCH_LLADA_MOE:
+ case LLM_ARCH_GROVEMOE:
+ case LLM_ARCH_MINIMAX_M2:
+ case LLM_ARCH_RND1:
+ case LLM_ARCH_PADDLEOCR:
+ case LLM_ARCH_MIMO2:
+ case LLM_ARCH_KIMI_LINEAR:
+ case LLM_ARCH_STEP35:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool moe_implemented(const llm_arch arch) {
+ if (moe_mandatory(arch)) {
+ return true;
+ }
+ switch (arch) {
+ case LLM_ARCH_LLAMA:
+ case LLM_ARCH_REFACT:
+ case LLM_ARCH_MINICPM:
+ case LLM_ARCH_GRANITE:
+ case LLM_ARCH_GRANITE_MOE:
+ case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
+ return true;
+ default:
+ return false;
+ }
+}
+
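+// write the synthetic test models as GGUF files to the given directory (currently disabled, see the GGML_ABORT below)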
+static int save_models(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level, const std::string & dir) {
+ GGML_ABORT("llama_model_save_to_file is broken");
+ struct user_data_t {
+ struct {
+ ggml_log_callback callback;
+ void * user_data;
+ } original_logger;
+ ggml_log_level min_level; // prints below this log level go to debug log
+ };
+ user_data_t ud;
+ llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+ ud.min_level = log_level;
+
+ llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+ const user_data_t * ud = (const user_data_t *) user_data;
+ const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+ ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+ }, &ud);
+
+ for (const llm_arch & arch : llm_arch_all()) {
+ if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+ continue;
+ }
+ if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+ continue; // These models don't have usable implementations.
+ }
+ for (bool moe : {false, true}) {
+ if (moe && !moe_implemented(arch)) {
+ continue;
+ }
+ if (!moe && moe_mandatory(arch)) {
+ continue;
+ }
+ gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
+ auto model_and_ctx = get_model_and_ctx(gguf_ctx.get(), seed, {});
+ const std::string path = dir + "/" + llm_arch_name(arch) + (moe ? "-moe.gguf" : "-dense.gguf");
+ LOG_INF("%s: Saving %s model (%s) to %s...\n", __func__, llm_arch_name(arch), moe ? "MoE" : "dense", path.c_str());
+ llama_model_save_to_file(model_and_ctx.first.get(), path.c_str());
+ }
+ }
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ return 0;
+}
+
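+// for each architecture, compare the logits produced by each non-CPU device against the CPU reference and report the NMSE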
+static int test_backends(const llm_arch target_arch, const size_t seed, const ggml_log_level log_level) {
+ struct user_data_t {
+ struct {
+ ggml_log_callback callback;
+ void * user_data;
+ } original_logger;
+ ggml_log_level min_level; // prints below this log level go to debug log
+ };
+ user_data_t ud;
+ llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+ ud.min_level = log_level;
+
+ llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+ const user_data_t * ud = (const user_data_t *) user_data;
+ const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+ ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+ }, &ud);
+
+ const std::vector<llama_token> tokens = get_tokens(128, 128, seed);
+
+ bool all_ok = true;
+ common_log_flush(common_log_main());
+ printf("|%15s|%30s|%6s|%8s|%6s|\n", "Model arch.", "Device", "Config", "NMSE", "Status");
+ printf("|---------------|------------------------------|------|--------|------|\n");
+ for (const llm_arch & arch : llm_arch_all()) {
+ if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) {
+ continue;
+ }
+ if (arch == LLM_ARCH_CLIP || arch == LLM_ARCH_GPTJ || arch == LLM_ARCH_UNKNOWN) {
+ continue; // These models don't have usable implementations.
+ }
+ if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+ continue; // FIXME CUDA backend crashes.
+ }
+ if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) {
+ continue; // FIXME Embedding (?) models produce inconsistent results.
+ }
+ if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
+ continue; // FIXME RWKV models hang indefinitely.
+ }
+ if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_MODERN_BERT || arch == LLM_ARCH_NOMIC_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE ||
+ arch == LLM_ARCH_NEO_BERT || arch == LLM_ARCH_JINA_BERT_V2 || arch == LLM_ARCH_JINA_BERT_V3 || arch == LLM_ARCH_EUROBERT) {
+ continue; // TODO vocab
+ }
+ if (arch == LLM_ARCH_PLM) {
+ continue; // TODO tensor shapes
+ }
+
+ // FIXME some models are segfaulting with WebGPU:
+#ifdef GGML_USE_WEBGPU
+ if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_KIMI_LINEAR) {
+ continue;
+ }
+#endif // GGML_USE_WEBGPU
+
+ const bool encode = arch == LLM_ARCH_T5;
+ for (bool moe : {false, true}) {
+ if (moe && !moe_implemented(arch)) {
+ continue;
+ }
+ if (!moe && moe_mandatory(arch)) {
+ continue;
+ }
+ gguf_context_ptr gguf_ctx = get_gguf_ctx(arch, moe);
+ auto model_and_ctx_cpu = get_model_and_ctx(gguf_ctx.get(), seed, {});
+ const std::vector<float> logits_cpu = get_logits(model_and_ctx_cpu.first.get(), model_and_ctx_cpu.second.get(), tokens, encode);
+ for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ continue;
+ }
+ auto model_and_ctx_dev = get_model_and_ctx(gguf_ctx.get(), seed, {dev});
+ const std::vector<float> logits_dev = get_logits(model_and_ctx_dev.first.get(), model_and_ctx_dev.second.get(), tokens, encode);
+ const double nmse_val = nmse(logits_cpu, logits_dev);
+ const bool ok = nmse_val <= 1e-4;
+ all_ok = all_ok && ok;
+ char nmse_str[10];
+ snprintf(nmse_str, sizeof(nmse_str), "%.2e", nmse_val);
+ printf("|%15s|%30s|%6s|%8s|%17s|\n", llm_arch_name(arch), ggml_backend_dev_description(dev),
+ moe ? "MoE" : "Dense", nmse_str, ok ? "\033[1;32mOK\033[0m" : "\033[1;31mFAIL\033[0m");
+ }
+ }
+ }
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ return all_ok ? 0 : 1;
+}
+
+int main(int argc, char ** argv) {
+ // FIXME these tests are disabled in the CI for macOS-latest-cmake-arm64 because they are segfaulting
+ common_init();
+ std::random_device rd;
+
+ llm_arch arch = LLM_ARCH_UNKNOWN;
+ size_t seed = rd();
+ ggml_log_level log_level = GGML_LOG_LEVEL_ERROR;
+ std::string out;
+
+ for (int i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--arch") == 0) {
+ if (i + 1 < argc) {
+ const std::string arch_name = argv[++i];
+ arch = llm_arch_from_string(arch_name);
+ if (arch == LLM_ARCH_UNKNOWN) {
+                    LOG_ERR("%s: unknown LLM architecture: %s\n", __func__, arch_name.c_str());
+ return 1;
+ }
+ } else {
+ usage(argv);
+ return 1;
+ }
+ }
+ if (strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "--seed") == 0) {
+ if (i + 1 < argc) {
+ seed = std::stoull(argv[++i]);
+ } else {
+ usage(argv);
+ return 1;
+ }
+ }
+ if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
+ log_level = GGML_LOG_LEVEL_INFO;
+ continue;
+ }
+ if (strcmp(argv[i], "-o") == 0 || strcmp(argv[i], "--out") == 0) {
+ if (i + 1 < argc) {
+ out = argv[++i];
+ } else {
+ usage(argv);
+ return 1;
+ }
+ }
+ }
+
+ try {
+ if (!out.empty()) {
+ return save_models(arch, seed, log_level, out);
+ }
+ return test_backends(arch, seed, log_level);
+ } catch (const std::exception & err) {
+ fprintf(stderr, "encountered runtime error: %s\n", err.what());
+ return -1;
+ }
+}
add_subdirectory(export-lora)
endif()
add_subdirectory(fit-params)
+ add_subdirectory(results)
endif()
--- /dev/null
+set(TARGET llama-results)
+add_executable(${TARGET} results.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+ install(TARGETS ${TARGET} RUNTIME)
+endif()
--- /dev/null
+# Results
+
+The `llama-results` tool records the outputs of a model to a file and can later `--check` the current outputs against that file, e.g. to detect whether they have changed relative to a previous commit.
+Example usage:
+
+``` sh
+llama-results --model model.gguf --output results.gguf --prompt "People die when they are killed." # writes results to file
+llama-results --model model.gguf --output results.gguf --prompt "People die when they are killed." --check # compares results vs file
+```
+
+The metric by which the results are compared is the normalized mean squared error (NMSE) with a tolerance of $10^{-6}$.
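+
+For reference, the NMSE between the stored logits $a$ and the newly computed logits $b$ is calculated as in the tool's `nmse()` helper:
+
+$$
+\mathrm{NMSE}(a, b) = \frac{\sum_i (a_i - b_i)^2}{\sum_i a_i^2}
+$$
+
+If the NMSE exceeds the tolerance, `--check` prints `FAIL` and returns a non-zero exit code, so the tool can be used directly in scripts or CI jobs.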
--- /dev/null
+#include "ggml-cpp.h"
+#include "ggml.h"
+#include "gguf.h"
+#include "llama.h"
+#include "common.h"
+#include "arg.h"
+#include "log.h"
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+// normalized mean squared error = mse(a, b) / mse(a, 0)
+static double nmse(const std::vector<float> & a, const std::vector<float> & b) {
+ GGML_ASSERT(a.size() == b.size());
+ double mse_a_b = 0.0;
+ double mse_a_0 = 0.0;
+
+ for (size_t i = 0; i < a.size(); i++) {
+ float a_i = a[i];
+ float b_i = b[i];
+
+ mse_a_b += (a_i - b_i) * (a_i - b_i);
+ mse_a_0 += a_i * a_i;
+ }
+
+ return mse_a_b / mse_a_0;
+}
+
+static std::vector<float> get_logits(
+ llama_model * model, llama_context * lctx, const std::vector<llama_token> & tokens) {
+ const uint32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+ const uint32_t n_ctx = llama_n_ctx(lctx);
+ const uint32_t n_tokens = tokens.size();
+ llama_batch batch = llama_batch_init(n_ctx, 0, 1);
+ GGML_ASSERT(n_tokens <= n_ctx);
+ for (uint32_t pos = 0; pos < n_tokens; pos++) {
+ common_batch_add(batch, tokens[pos], pos, {0}, true);
+ }
+ batch.n_tokens = n_tokens;
+ if (llama_decode(lctx, batch)) {
+ llama_batch_free(batch);
+ throw std::runtime_error("failed to decode batch");
+ }
+
+ std::vector<float> ret;
+ ret.reserve(n_tokens*n_vocab);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ const float * logits_ith = llama_get_logits_ith(lctx, i);
+ for (uint32_t j = 0; j < n_vocab; j++) {
+ ret.push_back(logits_ith[j]);
+ }
+ }
+ llama_batch_free(batch);
+ return ret;
+}
+
+int main(int argc, char ** argv) {
+ common_params params;
+ params.escape = false;
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RESULTS)) {
+ return 1;
+ }
+ if (params.out_file.empty()) {
+ LOG_ERR("%s: an output file must be specified", __func__);
+ return 1;
+ }
+ common_init();
+ llama_backend_init();
+ llama_numa_init(params.numa);
+ common_init_result_ptr llama_init = common_init_from_params(params);
+ struct llama_model * model = llama_init->model();
+ struct llama_context * lctx = llama_init->context();
+ if (model == nullptr) {
+ LOG_ERR("%s: unable to load model\n", __func__);
+ return 1;
+ }
+ const uint32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
+
+ const std::vector<llama_token> tokens_calc = common_tokenize(lctx, params.prompt, true);
+ const std::vector<float> logits_calc = get_logits(model, lctx, tokens_calc);
+ GGML_ASSERT(logits_calc.size() == tokens_calc.size()*n_vocab);
+
+ struct gguf_init_params gguf_params = {
+ /*.no_alloc =*/ true,
+ /*.ctx =*/ nullptr,
+ };
+ gguf_context_ptr gguf_ctx_model(gguf_init_from_file(params.model.path.c_str(), gguf_params));
+
+ if (params.check) {
+ LOG_INF("%s: loading results from %s...\n", __func__, params.out_file.c_str());
+ gguf_context_ptr gguf_ctx;
+ {
+ struct gguf_init_params gguf_params = {
+ /*.no_alloc =*/ true,
+ /*.ctx =*/ nullptr,
+ };
+ gguf_ctx.reset(gguf_init_from_file(params.out_file.c_str(), gguf_params));
+ }
+ const std::string path_model_disk = gguf_get_val_str(gguf_ctx.get(), gguf_find_key(gguf_ctx.get(), "path_model"));
+ GGML_ASSERT(path_model_disk == params.model.path); // TODO better checks
+
+ auto load_tensor_data = [&](const std::string & name, void * dst, const size_t size){
+ const int64_t tid = gguf_find_tensor(gguf_ctx.get(), name.c_str());
+ const size_t offset = gguf_get_data_offset(gguf_ctx.get()) + gguf_get_tensor_offset(gguf_ctx.get(), tid);
+ GGML_ASSERT(size == gguf_get_tensor_size(gguf_ctx.get(), tid));
+
+ FILE * file = ggml_fopen(params.out_file.c_str(), "rb");
+ if (file == nullptr) {
+ throw std::runtime_error("failed to open results file");
+ }
+ if (fseek(file, offset, SEEK_SET) != 0) {
+ fclose(file);
+ throw std::runtime_error("fseek failed");
+ }
+ const size_t nbytes_read = fread(dst, 1, size, file);
+ fclose(file);
+ if (nbytes_read != size) {
+ throw std::runtime_error("fread failed");
+ }
+ };
+
+ std::vector<llama_token> tokens_disk(tokens_calc.size());
+ load_tensor_data("tokens", tokens_disk.data(), tokens_disk.size()*sizeof(llama_token));
+ GGML_ASSERT(tokens_disk.size() == tokens_calc.size());
+ for (size_t i = 0; i < tokens_calc.size(); i++) {
+ GGML_ASSERT(tokens_disk[i] == tokens_calc[i]);
+ }
+
+ std::vector<float> logits_disk(logits_calc.size());
+ load_tensor_data("logits", logits_disk.data(), logits_disk.size()*sizeof(float));
+ const double nmse_val = nmse(logits_disk, logits_calc);
+ LOG_INF("%s: NMSE=%.3e\n", __func__, nmse_val);
+
+ if (nmse_val > 1e-6) {
+ printf("\033[1;31mFAIL\033[0m\n");
+ return 1;
+ }
+
+ printf("\033[1;32mOK\033[0m\n");
+ return 0;
+ }
+
+ ggml_context_ptr ggml_ctx_calc;
+ {
+ const size_t size_tokens = tokens_calc.size()*sizeof(llama_token) + ggml_tensor_overhead();
+ const size_t size_logits = logits_calc.size()*sizeof(float) + ggml_tensor_overhead();
+ struct ggml_init_params ggml_params = {
+ /*.mem_size =*/ size_tokens + size_logits,
+ /*.mem_buffer =*/ nullptr,
+ /*.no_alloc =*/ false,
+ };
+ ggml_ctx_calc.reset(ggml_init(ggml_params));
+ }
+
+ gguf_context_ptr gguf_ctx(gguf_init_empty());
+ gguf_set_val_str(gguf_ctx.get(), "path_model", params.model.path.c_str());
+ {
+ ggml_tensor * t_tokens = ggml_new_tensor_1d(ggml_ctx_calc.get(), GGML_TYPE_I32, tokens_calc.size());
+ ggml_set_name(t_tokens, "tokens");
+ int32_t * tokens_data = (int32_t *) t_tokens->data;
+ for (uint32_t i = 0; i < tokens_calc.size(); i++) {
+ tokens_data[i] = tokens_calc[i];
+ }
+ gguf_add_tensor(gguf_ctx.get(), t_tokens);
+ }
+ {
+ ggml_tensor * t_logits = ggml_new_tensor_2d(ggml_ctx_calc.get(), GGML_TYPE_F32, tokens_calc.size(), n_vocab);
+ ggml_set_name(t_logits, "logits");
+ float * logits_data = ggml_get_data_f32(t_logits);
+ for (uint32_t i = 0; i < tokens_calc.size(); i++) {
+ const float * logits_ith = llama_get_logits_ith(lctx, i);
+ for (uint32_t j = 0; j < n_vocab; j++) {
+ logits_data[i*n_vocab + j] = logits_ith[j];
+ }
+ }
+ gguf_add_tensor(gguf_ctx.get(), t_logits);
+ }
+ LOG_INF("%s: writing results to %s...\n", __func__, params.out_file.c_str());
+ gguf_write_to_file(gguf_ctx.get(), params.out_file.c_str(), /*only_meta =*/ false);
+ return 0;
+}
+