return nullptr;
}
-static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
+ llama_model & model = adapter.model;
+
ggml_context * ctx_init;
gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
}
}
+ // update number of nodes used
+ model.n_lora_nodes += adapter.get_n_nodes();
+
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
- llama_adapter_lora * adapter = new llama_adapter_lora();
+ llama_adapter_lora * adapter = new llama_adapter_lora(*model);
try {
- llama_adapter_lora_init_impl(*model, path_lora, *adapter);
+ llama_adapter_lora_init_impl(path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
}
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
+    if (adapter == nullptr) {
+        return;
+    }
+
+    // update number of nodes used
+    GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes());
+    adapter->model.n_lora_nodes -= adapter->get_n_nodes();
+
delete adapter;
}
};
struct llama_adapter_lora {
+ llama_model & model;
+
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
- llama_adapter_lora() = default;
+ llama_adapter_lora(llama_model & model) : model(model) {}
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
+
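+    // upper bound on the extra graph nodes needed to apply this adapter
+    // (accounted for in llama_model::n_lora_nodes and in the graph size estimate)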
+ uint32_t get_n_nodes() const {
+ return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat
+ }
};
using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
{ LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" },
+ { LLM_ARCH_MODERN_BERT, "modern-bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
{ LLM_ARCH_NEO_BERT, "neo-bert" },
{ LLM_ARCH_PHIMOE, "phimoe" },
{ LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_PLAMO2, "plamo2" },
+ { LLM_ARCH_PLAMO3, "plamo3" },
{ LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
+ { LLM_ARCH_MIMO2, "mimo2" },
+ { LLM_ARCH_LLAMA_EMBED, "llama-embed" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
{ LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
{ LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+ { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
};
+ case LLM_ARCH_MODERN_BERT:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_TOKEN_EMBD_NORM,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_CLS,
+ LLM_TENSOR_CLS_OUT,
+ };
case LLM_ARCH_JINA_BERT_V2:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM,
};
+ case LLM_ARCH_PLAMO3:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_QKV,
+ LLM_TENSOR_ATTN_Q_NORM,
+ LLM_TENSOR_ATTN_K_NORM,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_ATTN_POST_NORM,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_POST_NORM,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ };
case LLM_ARCH_CODESHELL:
return {
LLM_TENSOR_TOKEN_EMBD,
LLM_TENSOR_VISEXP_FFN_DOWN,
LLM_TENSOR_VISEXP_FFN_UP,
};
+ case LLM_ARCH_MIMO2:
+ return {
+ LLM_TENSOR_TOKEN_EMBD,
+ LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_ATTN_NORM,
+ LLM_TENSOR_ATTN_Q,
+ LLM_TENSOR_ATTN_K,
+ LLM_TENSOR_ATTN_V,
+ LLM_TENSOR_ATTN_SINKS,
+ LLM_TENSOR_ATTN_OUT,
+ LLM_TENSOR_FFN_NORM,
+ LLM_TENSOR_FFN_GATE,
+ LLM_TENSOR_FFN_DOWN,
+ LLM_TENSOR_FFN_UP,
+ LLM_TENSOR_FFN_GATE_INP,
+ LLM_TENSOR_FFN_GATE_EXPS,
+ LLM_TENSOR_FFN_DOWN_EXPS,
+ LLM_TENSOR_FFN_UP_EXPS,
+ LLM_TENSOR_FFN_EXP_PROBS_B,
+ };
case LLM_ARCH_GPTJ:
case LLM_ARCH_UNKNOWN:
return {
LLM_ARCH_STARCODER,
LLM_ARCH_REFACT,
LLM_ARCH_BERT,
+ LLM_ARCH_MODERN_BERT,
LLM_ARCH_NOMIC_BERT,
LLM_ARCH_NOMIC_BERT_MOE,
LLM_ARCH_NEO_BERT,
LLM_ARCH_PHIMOE,
LLM_ARCH_PLAMO,
LLM_ARCH_PLAMO2,
+ LLM_ARCH_PLAMO3,
LLM_ARCH_CODESHELL,
LLM_ARCH_ORION,
LLM_ARCH_INTERNLM2,
LLM_ARCH_RND1,
LLM_ARCH_PANGU_EMBED,
LLM_ARCH_MISTRAL3,
+ LLM_ARCH_MIMO2,
+ LLM_ARCH_LLAMA_EMBED,
LLM_ARCH_UNKNOWN,
};
LLM_KV_ATTENTION_GATE_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_OUTPUT_SCALE,
LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
+ LLM_KV_ROPE_FREQ_BASE_SWA,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
model.n_devices() > 1 &&
- model.params.n_gpu_layers > (int) model.hparams.n_layer &&
- model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+ model.n_gpu_layers() > model.hparams.n_layer &&
+ model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
cparams.offload_kqv &&
!model.has_tensor_overrides();
}
llama_context::~llama_context() {
- // FIXME this currently results in a use-after-free bug if the model is freed before the context
- // if (!model.hparams.no_alloc) {
- // for (size_t i = 0; i < backend_ptrs.size(); ++i) {
- // ggml_backend_t backend = backend_ptrs[i];
- // ggml_backend_buffer_type_t buft = backend_buft[i];
-
- // const size_t size_exp = backend_buf_exp_size[i];
- // const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
- // if (size_exp == size_act) {
- // LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
- // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- // } else {
- // LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
- // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- // }
- // }
- // }
+ if (!model.hparams.no_alloc) {
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+
+ const size_t size_exp = backend_buf_exp_size[i];
+ const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ if (size_exp == size_act) {
+ LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ } else {
+                LLAMA_LOG_WARN("%s: %10s compute buffer size is %8.4f MiB, does not match expectation of %8.4f MiB\n",
+                    __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
+ }
+ }
+ }
ggml_opt_free(opt_ctx);
}
if (model.arch == LLM_ARCH_QWEN3NEXT) {
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
}
- return std::max<uint32_t>(1024u, 8u*model.n_tensors());
+ uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
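+    // reserve extra nodes for the LoRA adapters currently loaded for this model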
+ res += model.n_lora_nodes;
+ return res;
}
llm_graph_result * llama_context::get_gf_res_reserve() const {
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
- const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+ const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
const auto & dev_layer = model.dev_layer(il);
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
// the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0;
- // if swa_layers[il] == true, then layer il is SWA
- // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+ // if swa_layers[il] == 1, then layer il is SWA
+ // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense
- std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+    // note: using uint32_t type for compatibility reasons
+ std::array<uint32_t, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models
uint32_t ssm_d_conv = 0;
bool do_shift,
stream_copy_info sc_info);
- // used to create a batch procesing context from a batch
+ // used to create a batch processing context from a batch
llama_kv_cache_context(
llama_kv_cache * kv,
slot_info_vec_t sinfos,
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
+ #include <fcntl.h>
+ #include <sys/stat.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
- #include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
return ret;
}
- impl(const char * fname, const char * mode) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
write_raw(&val, sizeof(val));
}
+    void read_aligned_chunk(size_t /* offset */, void * /* dest */, size_t /* size */) const {
+        throw std::runtime_error("DirectIO is not implemented on Windows.");
+    }
+
~impl() {
if (fp) {
std::fclose(fp);
}
}
#else
- impl(const char * fname, const char * mode) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+#ifdef __linux__
+ // Try unbuffered I/O for read only
+ if (use_direct_io && std::strcmp(mode, "rb") == 0) {
+ fd = open(fname, O_RDONLY | O_DIRECT);
+
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);
+
+ size = file_stats.st_size;
+ alignment = file_stats.st_blksize;
+
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return;
+ }
+
+        LLAMA_LOG_WARN("%s: failed to open %s with O_DIRECT (%s), falling back to buffered I/O\n",
+                __func__, fname, strerror(errno));
+ }
+#endif
fp = ggml_fopen(fname, mode);
if (fp == NULL) {
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
}
size_t tell() const {
-// TODO: this ifdef is never true?
-#ifdef _WIN32
- __int64 ret = _ftelli64(fp);
-#else
- long ret = std::ftell(fp);
-#endif
- if (ret == -1) {
- throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ if (fd == -1) {
+ long ret = std::ftell(fp);
+ if (ret == -1) {
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ }
+
+ return (size_t) ret;
}
- return (size_t) ret;
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos == -1) {
+ throw std::runtime_error(format("lseek error: %s", strerror(errno)));
+ }
+ return (size_t) pos;
}
void seek(size_t offset, int whence) const {
-// TODO: this ifdef is never true?
-#ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
- int ret = std::fseek(fp, (long) offset, whence);
-#endif
- if (ret != 0) {
+ off_t ret = 0;
+ if (fd == -1) {
+ ret = std::fseek(fp, (long) offset, whence);
+ } else {
+ ret = lseek(fd, offset, whence);
+ }
+ if (ret == -1) {
throw std::runtime_error(format("seek error: %s", strerror(errno)));
}
}
return;
}
errno = 0;
- std::size_t ret = std::fread(ptr, len, 1, fp);
- if (ferror(fp)) {
- throw std::runtime_error(format("read error: %s", strerror(errno)));
+ if (fd == -1) {
+ std::size_t ret = std::fread(ptr, len, 1, fp);
+ if (ferror(fp)) {
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (ret != 1) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+    } else {
+        // read() may return fewer bytes than requested (e.g. after being interrupted
+        // by a signal mid-transfer), so loop until the whole range has been read
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            ssize_t ret = read(fd, (char *) ptr + bytes_read, len - bytes_read);
+
+            if (ret == -1) {
+                if (errno == EINTR) {
+                    continue; // Interrupted by signal, retry
+                }
+                throw std::runtime_error(format("read error: %s", strerror(errno)));
+            }
+            if (ret == 0) {
+                if (bytes_read == 0) {
+                    throw std::runtime_error("unexpectedly reached end of file");
+                }
+                break; // EOF: aligned reads may extend past the end of the file
+            }
+
+            bytes_read += (size_t) ret;
+        }
}
- if (ret != 1) {
- throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
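+        // O_DIRECT requires the file offset, the transfer size and the destination buffer
+        // to be block-aligned, so read a padded aligned chunk into a temporary buffer
+        // and copy only the requested range into dest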
+ off_t aligned_offset = offset & ~(alignment - 1);
+ off_t offset_from_alignment = offset - aligned_offset;
+ size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
+
+ void * raw_buffer = nullptr;
+ int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
+ if (ret != 0) {
+ throw std::runtime_error(format("posix_memalign failed with error %d", ret));
}
+
+ struct aligned_buffer_deleter {
+ void operator()(void * p) const { free(p); }
+ };
+ std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
+
+ seek(aligned_offset, SEEK_SET);
+ read_raw(buffer.get(), bytes_to_read);
+
+ uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
+ memcpy(dest, reinterpret_cast<void *>(actual_data), size);
}
uint32_t read_u32() const {
}
~impl() {
- if (fp) {
+ if (fd != -1) {
+ close(fd);
+ } else {
std::fclose(fp);
}
}
+ int fd = -1;
#endif
- FILE * fp;
- size_t size;
+ void read_raw_at(void * ptr, size_t len, size_t offset) const {
+ if (alignment != 1) {
+ read_aligned_chunk(offset, ptr, len);
+ } else {
+ seek(offset, SEEK_SET);
+ read_raw(ptr, len);
+ }
+ }
+
+ size_t read_alignment() const {
+ return alignment;
+ }
+
+ size_t alignment = 1;
+
+ FILE * fp{};
+ size_t size{};
};
-llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
+ pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
llama_file::~llama_file() = default;
size_t llama_file::tell() const { return pimpl->tell(); }
size_t llama_file::size() const { return pimpl->size; }
+size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+
int llama_file::file_id() const {
#ifdef _WIN32
return _fileno(pimpl->fp);
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
#include <cstdint>
#include <memory>
#include <vector>
+#include <cstdio>
struct llama_file;
struct llama_mmap;
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
struct llama_file {
- llama_file(const char * fname, const char * mode);
+ llama_file(const char * fname, const char * mode, bool use_direct_io = false);
~llama_file();
size_t tell() const;
void seek(size_t offset, int whence) const;
void read_raw(void * ptr, size_t len) const;
+ void read_raw_at(void * ptr, size_t len, size_t offset) const;
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
uint32_t read_u32() const;
void write_raw(const void * ptr, size_t len) const;
void write_u32(uint32_t val) const;
+ size_t read_alignment() const;
private:
struct impl;
std::unique_ptr<impl> pimpl;
return get_key_or_arr(llm_kv(kid), result, n, required);
}
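+    // scalar-only variant: fails (or throws, if required) when the key stores an array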
+ bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) {
+ const std::string key = llm_kv(kid);
+
+ const int id = gguf_find_key(meta.get(), key.c_str());
+
+ if (id < 0) {
+ if (required) {
+ throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+ }
+ return false;
+ }
+
+    // throw an error if the type is an array
+ if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) {
+ if (required) {
+ throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str()));
+ }
+ return false;
+ }
+
+ return get_key(key, result, required);
+ }
+
// TODO: this is not very clever - figure out something better
template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
llm_kv = LLM_KV(llm_arch_from_string(arch_name));
- files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
contexts.emplace_back(ctx);
// Save tensors data offset of the main file.
}
}
- files.emplace_back(new llama_file(fname_split, "rb"));
+ files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
contexts.emplace_back(ctx);
// Save tensors data offset info of the shard.
// 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
// NVMe raid configurations might require more / larger buffers.
constexpr size_t n_buffers = 4;
- constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+ size_t alignment = 1;
+ for (const auto & file : files) {
+ alignment = std::max(file->read_alignment(), alignment);
+ }
+
+ // Buffer size: balance between memory usage and I/O efficiency
+ // 64MB works well for NVMe drives
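+    // the extra 2 * alignment bytes leave slack for aligning the destination pointer and for the padding of aligned reads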
+ const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;
std::vector<ggml_backend_buffer_t> host_buffers;
std::vector<ggml_backend_event_t> events;
// If the backend is supported, create pinned memory buffers and events for synchronisation.
for (size_t idx = 0; idx < n_buffers; ++idx) {
auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
if (!buf) {
LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
ggml_backend_dev_name(dev));
}
} else {
const auto & file = files.at(weight->idx);
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(cur->data, n_size);
+ file->read_raw_at(cur->data, n_size, weight->offs);
if (check_tensors) {
validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
} else {
// If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
if (upload_backend) {
- file->seek(weight->offs, SEEK_SET);
+ size_t offset = weight->offs;
+ alignment = file->read_alignment();
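+                    // O_DIRECT needs block-aligned file offsets: round down and remember how far into the first block the tensor starts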
+ size_t aligned_offset = offset & ~(alignment - 1);
+ size_t offset_from_alignment = offset - aligned_offset;
+ file->seek(aligned_offset, SEEK_SET);
+
+ // Calculate aligned read boundaries
+ size_t read_start = aligned_offset;
+ size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);
size_t bytes_read = 0;
+ size_t data_read = 0; // Actual tensor data copied (excluding padding)
- while (bytes_read < n_size) {
- size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+ while (bytes_read < read_end - read_start) {
+ size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);
+ // Align the destination pointer within the pinned buffer
+ uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);
+
+ // Wait for previous upload to complete before reusing buffer
ggml_backend_event_synchronize(events[buffer_idx]);
- file->read_raw(host_ptrs[buffer_idx], read_iteration);
- ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+
+ // Read aligned chunk from file
+ file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+ // Calculate actual data portion (excluding alignment padding)
+ uintptr_t ptr_data = ptr_dest_aligned;
+ size_t data_to_copy = read_size;
+
+ // Skip alignment padding at start of first chunk
+ if (bytes_read == 0) {
+ ptr_data += offset_from_alignment;
+ data_to_copy -= offset_from_alignment;
+ }
+
+ // Trim alignment padding at end of last chunk
+ if (aligned_offset + bytes_read + read_size > offset + n_size) {
+ data_to_copy -= (read_end - (offset + n_size));
+ }
+
+ // Async upload actual data to GPU
+ ggml_backend_tensor_set_async(upload_backend, cur,
+ reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
ggml_backend_event_record(events[buffer_idx], upload_backend);
- bytes_read += read_iteration;
+ data_read += data_to_copy;
+ bytes_read += read_size;
+
++buffer_idx;
buffer_idx %= n_buffers;
}
} else {
read_buf.resize(n_size);
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), n_size);
+ file->read_raw_at(read_buf.data(), n_size, weight->offs);
ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
template<typename T>
bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true);
+ bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true);
+
std::string get_arch_name() const;
enum llm_arch get_arch() const;
case LLM_TYPE_17M: return "17M";
case LLM_TYPE_22M: return "22M";
case LLM_TYPE_33M: return "33M";
+ case LLM_TYPE_47M: return "47M";
case LLM_TYPE_60M: return "60M";
case LLM_TYPE_70M: return "70M";
case LLM_TYPE_80M: return "80M";
case LLM_TYPE_109M: return "109M";
case LLM_TYPE_137M: return "137M";
case LLM_TYPE_140M: return "140M";
+ case LLM_TYPE_149M: return "149M";
case LLM_TYPE_160M: return "160M";
case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M";
case LLM_TYPE_335M: return "335M";
case LLM_TYPE_350M: return "350M";
case LLM_TYPE_360M: return "360M";
+ case LLM_TYPE_395M: return "395M";
case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M";
case LLM_TYPE_230B_A10B: return "230B.A10B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
+ case LLM_TYPE_310B_A15B: return "310B.A15B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
- if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+ if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
// arch-specific KVs
switch (arch) {
case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA_EMBED:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 3;
+ hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
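+                    // every swa_period-th layer stays dense (full attention), the remaining layers use the sliding window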
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ switch (hparams.n_layer) {
+ case 12:
+ type = LLM_TYPE_47M; break; // granite-embedding-small
+ case 22:
+ type = LLM_TYPE_149M; break; // modern-bert-base
+ case 28:
+ type = LLM_TYPE_395M; break; // modern-bert-large
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_JINA_BERT_V2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
} break;
+ case LLM_ARCH_PLAMO3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+ if (found_swa && hparams.n_swa > 0) {
+ uint32_t swa_period = 8;
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.rope_freq_scale_train_swa = 1.0f;
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ } else {
+ hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+ }
+
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
case LLM_ARCH_GPT2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
default: type = LLM_TYPE_UNKNOWN;
}
} break;
+ case LLM_ARCH_MIMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
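+            // per-layer SWA pattern, stored as an array of 0/1 flags (1 = SWA, 0 = dense)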
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+
+ switch (hparams.n_layer) {
+ case 48: type = LLM_TYPE_310B_A15B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
default: throw std::runtime_error("unsupported model architecture");
}
bool llama_model::load_tensors(llama_model_loader & ml) {
const auto & split_mode = params.split_mode;
- const auto & n_gpu_layers = params.n_gpu_layers;
const auto & use_mlock = params.use_mlock;
const auto & tensor_split = params.tensor_split;
- const int n_layer = hparams.n_layer;
+ const int n_layer = hparams.n_layer;
+ const int n_gpu_layers = this->n_gpu_layers();
const bool use_mmap_buffer = true;
if (cpu_dev == nullptr) {
throw std::runtime_error(format("%s: no CPU backend found", __func__));
}
- const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
- const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
+ const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+ const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
- const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
+ const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
return {cpu_dev, &pimpl->cpu_buft_list};
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+ tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            if (i != 0) {
+                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+            } else {
+                // layer 0 uses identity
+                layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            }
+
+            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ }
+
+ cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+ cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+ } break;
case LLM_ARCH_NEO_BERT:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
}
} break;
+ case LLM_ARCH_PLAMO3:
+ {
+ const int64_t head_dim_q = hparams.n_embd_head_k;
+ const int64_t head_dim_v = hparams.n_embd_head_v;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+ if (output == NULL) {
+ output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+
+ const int64_t num_attention_heads = hparams.n_head(i);
+ const int64_t num_key_value_heads = hparams.n_head_kv(i);
+ const int64_t q_proj_dim = num_attention_heads * head_dim_q;
+ const int64_t k_proj_dim = num_key_value_heads * head_dim_q;
+ const int64_t v_proj_dim = num_key_value_heads * head_dim_v;
+ const int64_t n_ff_cur = hparams.n_ff(i);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i),
+ {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0);
+
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0);
+ }
+ } break;
case LLM_ARCH_GPT2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
- const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
- const int64_t n_ff_shexp = hparams.n_ff_shexp;
-
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
} else {
if (n_expert != 0) {
+ const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+ const int64_t n_ff_shexp = hparams.n_ff_shexp;
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
}
} break;
+ case LLM_ARCH_MIMO2:
+ {
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = layers[i];
+ uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+ uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+ uint32_t n_head = hparams.n_head(i);
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0);
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+ // non-MoE branch
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+ // MoE branch
+ int64_t n_ff_exp = hparams.n_ff_exp;
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
if (llama_supports_gpu_offload()) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
- if (n_gpu_layers > (int) hparams.n_layer) {
+ int n_repeating = n_gpu;
+ if (n_repeating > 0) {
LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+            if (n_gpu_layers <= (int) hparams.n_layer) {
+                // the output layer counts towards n_gpu_layers, so it only displaces a repeating layer when not all layers are offloaded
+                n_repeating--;
+            }
}
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);
const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 1;
return devices.size();
}
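+// params.n_gpu_layers < 0 means "offload all layers", i.e. every repeating layer plus the output layer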
+uint32_t llama_model::n_gpu_layers() const {
+ return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+ return params.split_mode;
+}
+
std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
std::map<ggml_backend_buffer_type_t, size_t> ret;
for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_NEO_BERT:
case LLM_ARCH_WAVTOKENIZER_DEC:
+ case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_GEMMA_EMBEDDING:
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
switch (arch) {
case LLM_ARCH_LLAMA:
{
- llm = std::make_unique<llm_build_llama>(*this, params);
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
} break;
case LLM_ARCH_LLAMA4:
{
if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) {
- llm = std::make_unique<llm_build_llama>(*this, params);
+ llm = std::make_unique<llm_build_llama<false>>(*this, params);
} else {
llm = std::make_unique<llm_build_llama_iswa>(*this, params);
}
} break;
+ case LLM_ARCH_LLAMA_EMBED:
+ {
+ llm = std::make_unique<llm_build_llama<true>>(*this, params);
+ } break;
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params);
{
llm = std::make_unique<llm_build_bert>(*this, params);
} break;
+ case LLM_ARCH_MODERN_BERT:
+ {
+ llm = std::make_unique<llm_build_modern_bert<true>>(*this, params);
+ } break;
case LLM_ARCH_NEO_BERT:
{
llm = std::make_unique<llm_build_neo_bert>(*this, params);
{
llm = std::make_unique<llm_build_plamo2>(*this, params);
} break;
+ case LLM_ARCH_PLAMO3:
+ {
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ llm = std::make_unique<llm_build_plamo3<true>> (*this, params);
+ } else {
+ llm = std::make_unique<llm_build_plamo3<false>>(*this, params);
+ }
+ } break;
case LLM_ARCH_GPT2:
{
llm = std::make_unique<llm_build_gpt2>(*this, params);
{
llm = std::make_unique<llm_build_mistral3>(*this, params);
} break;
+ case LLM_ARCH_MIMO2:
+ {
+ llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
+ } break;
default:
GGML_ABORT("fatal error");
}
llama_model_params result = {
/*.devices =*/ nullptr,
/*.tensor_buft_overrides =*/ nullptr,
- /*.n_gpu_layers =*/ 999,
+ /*.n_gpu_layers =*/ -1,
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_MISTRAL3:
+ case LLM_ARCH_LLAMA_EMBED:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_JINA_BERT_V3:
+ case LLM_ARCH_MODERN_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_STABLELM:
case LLM_ARCH_PHIMOE:
case LLM_ARCH_PLAMO:
case LLM_ARCH_PLAMO2:
+ case LLM_ARCH_PLAMO3:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
case LLM_ARCH_PANGU_EMBED:
case LLM_ARCH_AFMOE:
case LLM_ARCH_QWEN3NEXT:
+ case LLM_ARCH_MIMO2:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:
LLM_TYPE_17M,
LLM_TYPE_22M,
LLM_TYPE_33M,
+ LLM_TYPE_47M,
LLM_TYPE_60M,
LLM_TYPE_70M,
LLM_TYPE_80M,
LLM_TYPE_109M,
LLM_TYPE_137M,
LLM_TYPE_140M,
+ LLM_TYPE_149M,
LLM_TYPE_160M,
LLM_TYPE_190M,
LLM_TYPE_220M,
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_360M,
+ LLM_TYPE_395M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
LLM_TYPE_230B_A10B, // Minimax M2
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
+    LLM_TYPE_310B_A15B, // MiMo-V2-Flash
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
struct ggml_tensor * dense_2_out_layers = nullptr;
struct ggml_tensor * dense_3_out_layers = nullptr;
- llama_model_params params;
-
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
// for quantize-stats only
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
+ // for keeping track of extra nodes used by lora adapters
+ uint32_t n_lora_nodes = 0;
+
int64_t t_load_us = 0;
int64_t t_start_us = 0;
size_t n_tensors() const;
size_t n_devices() const;
+ uint32_t n_gpu_layers() const;
+ llama_split_mode split_mode() const;
+
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
// total number of parameters in the model
ggml_cgraph * build_graph(const llm_graph_params & params) const;
private:
+ llama_model_params params;
+
struct impl;
std::unique_ptr<impl> pimpl;
};
}
void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+ if (!smpl) {
+ return;
+ }
+
if (smpl->iface->accept) {
smpl->iface->accept(smpl, token);
}
}
void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+ if (!smpl) {
+ return;
+ }
+
GGML_ASSERT(smpl->iface->apply);
smpl->iface->apply(smpl, cur_p);
}
void llama_sampler_reset(struct llama_sampler * smpl) {
+ if (!smpl) {
+ return;
+ }
+
if (smpl->iface->reset) {
smpl->iface->reset(smpl);
}
}
struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+ if (!smpl) {
+ return nullptr;
+ }
+
if (smpl->iface->clone) {
return smpl->iface->clone(smpl);
}
delete smpl;
}
-llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
- const auto * logits = llama_get_logits_ith(ctx, idx);
-
- const llama_model * model = llama_get_model(ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
-
- const int n_vocab = llama_vocab_n_tokens(vocab);
-
- // TODO: do not allocate each time
- std::vector<llama_token_data> cur;
- cur.reserve(n_vocab);
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
- }
-
- llama_token_data_array cur_p = {
- /* .data = */ cur.data(),
- /* .size = */ cur.size(),
- /* .selected = */ -1,
- /* .sorted = */ false,
- };
-
- llama_sampler_apply(smpl, &cur_p);
-
- GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
-
- auto token = cur_p.data[cur_p.selected].id;
-
- llama_sampler_accept(smpl, token);
-
- return token;
-}
-
// sampler chain
static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
/* .ctx = */ new llama_sampler_chain {
/* .params = */ params,
/* .samplers = */ {},
+ /* .cur = */ {},
/* .t_sample_us = */ 0,
/* .n_sample = */ 0,
}
);
}
+llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
+ const auto * logits = llama_get_logits_ith(ctx, idx);
+
+ const llama_model * model = llama_get_model(ctx);
+ const llama_vocab * vocab = llama_model_get_vocab(model);
+
+ const int n_vocab = llama_vocab_n_tokens(vocab);
+
+ // use pre-allocated buffer from chain if available, otherwise allocate locally
+ std::vector<llama_token_data> * cur_ptr;
+ std::vector<llama_token_data> cur_local;
+
+ if (smpl->iface == &llama_sampler_chain_i) {
+ auto * chain = (llama_sampler_chain *) smpl->ctx;
+ cur_ptr = &chain->cur;
+ } else {
+ cur_ptr = &cur_local;
+ }
+
+ auto & cur = *cur_ptr;
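+    // when reusing the chain buffer, the capacity from previous calls is kept, so resize() does not reallocate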
+ cur.resize(n_vocab);
+ for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+ cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+ }
+
+ llama_token_data_array cur_p = {
+ /* .data = */ cur.data(),
+ /* .size = */ cur.size(),
+ /* .selected = */ -1,
+ /* .sorted = */ false,
+ };
+
+ llama_sampler_apply(smpl, &cur_p);
+
+ GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+ auto token = cur_p.data[cur_p.selected].id;
+
+ llama_sampler_accept(smpl, token);
+
+ return token;
+}
+
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
auto * p = (llama_sampler_chain *) chain->ctx;
p->samplers.push_back(smpl);
std::vector<struct llama_sampler *> samplers;
+ // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
+ std::vector<llama_token_data> cur;
+
// timing
mutable int64_t t_sample_us;
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0" ||
- tokenizer_pre == "mellum") {
+ tokenizer_pre == "mellum" ||
+            tokenizer_pre == "modern-bert") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "jina-v1-en" ||
for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
+ } else if (_contains_any(model_name, {"modern-bert"})) {
+            if (token_to_id.count("[MASK]") == 0) {
+                LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__);
+            } else {
+                _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true);
+            }
}
}
}
};
// this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
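+// exception type for conditions that abort automatic parameter fitting (e.g. values already set by the user)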
+class llama_params_fit_exception : public std::runtime_error {
+ using std::runtime_error::runtime_error;
+};
+
static void llama_params_fit_impl(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
}
}
- int64_t sum_total = 0;
+ int64_t sum_free = 0;
int64_t sum_projected_free = 0;
int64_t min_projected_free = INT64_MAX;
int64_t sum_projected_used = 0;
int64_t sum_projected_model = 0;
- int64_t sum_projected_ctx = 0;
if (nd > 1) {
LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
const int64_t projected_used = dmd.mb.total();
const int64_t projected_free = dmd.free - projected_used;
- sum_total += dmd.total;
+ sum_free += dmd.free;
sum_projected_used += projected_used;
sum_projected_free += projected_free;
min_projected_free = std::min(min_projected_free, projected_free);
sum_projected_model += dmd.mb.model;
- sum_projected_ctx += dmd.mb.context;
if (nd > 1) {
LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
projected_free >= 0 ? "surplus" : "deficit");
}
}
- assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
- assert(sum_projected_used >= sum_projected_ctx);
+ assert(sum_free >= 0 && sum_projected_used >= 0);
LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
- __func__, sum_projected_used/MiB, sum_total/MiB);
+ __func__, sum_projected_used/MiB, sum_free/MiB);
if (min_projected_free >= margin) {
if (nd == 1) {
LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
__func__, margin/MiB, -global_surplus/MiB);
if (cparams->n_ctx == 0) {
if (hp_nct > n_ctx_min) {
- const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
-
- int64_t memory_reduction = -global_surplus;
+ int64_t sum_used_target = sum_free - nd*margin_s;
if (nd > 1) {
// for multiple devices we need to be more conservative in terms of how much context we think can fit:
// - for dense models only whole layers can be assigned to devices
// - on average we expect a waste of 0.5 layers/tensors per device
// - use slightly more than the expected average for nd devices to be safe
const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
- memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+ sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
}
- uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
- cparams->n_ctx = hp_nct - ctx_reduction;
- cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
-
- ctx_reduction = hp_nct - cparams->n_ctx;
- memory_reduction = ctx_reduction * bytes_per_ctx;
- global_surplus += memory_reduction;
- LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
- __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
- if (global_surplus >= 0) {
+ int64_t sum_projected_used_min_ctx = 0;
+ cparams->n_ctx = n_ctx_min;
+ const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+ for (const auto & dmd : dmds_min_ctx) {
+ sum_projected_used_min_ctx += dmd.mb.total();
+ }
+ if (sum_used_target > sum_projected_used_min_ctx) {
+ // linear interpolation between minimum and maximum context size:
+ cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx)
+ / (sum_projected_used - sum_projected_used_min_ctx);
+ cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+ const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
+ const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
if (nd == 1) {
LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
return;
}
LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+ } else {
+ const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
}
if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
- throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+ throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
}
if (nd > 1) {
if (!tensor_split) {
- throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+ throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort");
}
if (mparams->tensor_split) {
for (size_t id = 0; id < nd; id++) {
if (mparams->tensor_split[id] != 0.0f) {
- throw std::runtime_error("model_params::tensor_split already set by user, abort");
+ throw llama_params_fit_exception("model_params::tensor_split already set by user, abort");
}
}
}
if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
- throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
- }
- if (hp_ngl < 2*nd) {
- throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
- + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
+ throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
}
}
if (!tensor_buft_overrides) {
- throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+ throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort");
}
if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
- throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+ throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort");
}
// step 3: iteratively fill the back to front with "dense" layers
auto set_ngl_tensor_split_tbo = [&](
const std::vector<ngl_t> & ngl_per_device,
const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
- llama_model_params & mparams,
- const bool add_nonrepeating) {
+ llama_model_params & mparams) {
mparams.n_gpu_layers = 0;
for (size_t id = 0; id < nd; id++) {
mparams.n_gpu_layers += ngl_per_device[id].n_layer;
tensor_split[id] = ngl_per_device[id].n_layer;
}
}
- assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
- uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
+ assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
+ uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides
- if (add_nonrepeating) {
- mparams.n_gpu_layers += 1;
- tensor_split[nd - 1] += 1;
- }
mparams.tensor_split = tensor_split;
size_t itbo = 0;
tensor_buft_overrides[itbo].buft = nullptr;
itbo++;
mparams.tensor_buft_overrides = tensor_buft_overrides;
- throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
- + std::to_string(ntbo) + " is insufficient for model\n");
+ throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == "
+ + std::to_string(ntbo) + " is insufficient for model");
}
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
tensor_buft_overrides[itbo].buft = overflow_bufts[id];
auto get_memory_for_layers = [&](
const char * func_name,
const std::vector<ngl_t> & ngl_per_device,
- const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
- const bool add_nonrepeating) -> std::vector<int64_t> {
+ const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
llama_model_params mparams_copy = *mparams;
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);
const dmds_t dmd_nl = llama_get_device_memory_data(
path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
}
- // whether for the optimal memory use we expect to load at least some MoE tensors:
- const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
-
std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
overflow_bufts.reserve(nd);
for (size_t id = 0; id < nd - 1; ++id) {
overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
std::vector<ngl_t> ngl_per_device(nd);
- std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
if (hp_nex > 0) {
for (size_t id = 0; id < nd; id++) {
ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
// - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
// - check memory use of our guess, replace either the low or high bound
// - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+ // - the last device has the output layer, which cannot be a partial layer
if (hp_nex == 0) {
LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
} else {
LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
}
for (int id = nd - 1; id >= 0; id--) {
- uint32_t n_unassigned = hp_ngl;
+ uint32_t n_unassigned = hp_ngl + 1;
for (size_t jd = id + 1; jd < nd; ++jd) {
assert(n_unassigned >= ngl_per_device[jd].n_layer);
n_unassigned -= ngl_per_device[jd].n_layer;
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
ngl_per_device_high[id].n_layer = n_unassigned;
if (hp_nex > 0) {
- ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
+ ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
}
if (ngl_per_device_high[id].n_layer > 0) {
- std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+ if (hp_nex > 0 && size_t(id) == nd - 1) {
+ delta--;
+ }
LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
while (delta > 1) {
uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
if (hp_nex) {
ngl_per_device_test[id].n_part += step_size;
}
- const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
} else {
assert(ngl_per_device_high[id].n_layer == n_unassigned);
ngl_per_device = ngl_per_device_high;
+ mem = mem_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
}
}
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
}
if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
return;
}
for (size_t id = 0; id <= id_dense_start; id++) {
std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
for (size_t jd = id_dense_start; jd < nd; jd++) {
- const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
+ const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
ngl_per_device_high[id].n_layer += n_layer_move;
ngl_per_device_high[jd].n_layer -= n_layer_move;
ngl_per_device_high[jd].n_part = 0;
}
size_t id_dense_start_high = nd - 1;
- std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
if (mem_high[id] > targets[id]) {
assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
break;
}
}
- const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
if (mem_test[id] <= targets[id]) {
ngl_per_device = ngl_per_device_test;
}
} else {
ngl_per_device = ngl_per_device_high;
+ mem = mem_high;
id_dense_start = id_dense_start_high;
LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
__func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
}
// try to fit at least part of one more layer
- if (ngl_per_device[id_dense_start].n_layer > 0) {
+ if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
size_t id_dense_start_test = id_dense_start;
ngl_per_device_test[id_dense_start_test].n_layer--;
}
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
- std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
- if (mem_test[id] < targets[id]) {
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
- if (mem_test[id] < targets[id]) {
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
} else {
ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
- if (mem_test[id] < targets[id]) {
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+ if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
ngl_per_device = ngl_per_device_test;
mem = mem_test;
id_dense_start = id_dense_start_test;
__func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
}
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
}
-bool llama_params_fit(
+enum llama_params_fit_status llama_params_fit(
const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
const int64_t t0_us = llama_time_us();
- bool ok = true;
+ llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
try {
llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
- } catch (const std::runtime_error & e) {
+ } catch (const llama_params_fit_exception & e) {
LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
- ok = false;
+ status = LLAMA_PARAMS_FIT_STATUS_FAILURE;
+ } catch (const std::runtime_error & e) {
+ LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what());
+ status = LLAMA_PARAMS_FIT_STATUS_ERROR;
}
const int64_t t1_us = llama_time_us();
LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
- return ok;
+ return status;
}
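// A minimal usage sketch of the new status codes; the model path, the margin and
// n_ctx_min values are placeholders, and sizing tensor_split via llama_max_devices()
// follows the usual llama.h convention.
#include "llama.h"

#include <vector>

static bool example_fit_params(llama_model_params & mparams, llama_context_params & cparams) {
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);

    const enum llama_params_fit_status status = llama_params_fit(
        "/path/to/model.gguf", &mparams, &cparams,
        tensor_split.data(), /*tensor_buft_overrides =*/ nullptr,
        /*margin_s =*/ 1024u*1024u*1024u, /*n_ctx_min =*/ 4096, GGML_LOG_LEVEL_INFO);

    switch (status) {
        case LLAMA_PARAMS_FIT_STATUS_SUCCESS: return true;  // mparams/cparams were adjusted to fit
        case LLAMA_PARAMS_FIT_STATUS_FAILURE: return false; // no fitting allocation was found
        case LLAMA_PARAMS_FIT_STATUS_ERROR:   return false; // hard error, e.g. the model file was not found
    }
    return false;
}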
struct llama_sampler_chain_params llama_sampler_chain_default_params() {
// NULL-terminated list of buffer types to use for tensors that match a pattern
const struct llama_model_tensor_buft_override * tensor_buft_overrides;
- int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t n_gpu_layers; // number of layers to store in VRAM (a negative value means all layers)
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
+ enum llama_params_fit_status {
+ LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
+ LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
+ LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path
+ };
+
// fits mparams and cparams to free device memory (assumes system memory is unlimited)
- // returns true if the parameters could be successfully modified to fit device memory
- // this function is NOT thread safe because it modifies the global llama logger state
- LLAMA_API bool llama_params_fit(
+ // - returns LLAMA_PARAMS_FIT_STATUS_SUCCESS if the parameters could be successfully modified to fit device memory
+ // - this function is NOT thread safe because it modifies the global llama logger state
+ // - only parameters that have the same value as in llama_default_model_params are modified
+ LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
struct llama_context_params * cparams,
//
// Load a LoRA adapter from file
+ // The adapter is valid as long as the associated model is not freed
+ // All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
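// A minimal usage sketch of the lifetime/ordering rules above; the paths are placeholders
// and error checking is omitted.
static void example_lora_usage(void) {
    llama_model * model = llama_model_load_from_file("/path/to/model.gguf", llama_model_default_params());

    // load all adapters before creating the context
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "/path/to/adapter.gguf");

    llama_context * ctx = llama_init_from_model(model, llama_context_default_params());
    llama_set_adapter_lora(ctx, adapter, /*scale =*/ 1.0f);

    // ... run inference ...

    llama_free(ctx);
    llama_adapter_lora_free(adapter); // free adapters before the model they were created from
    llama_model_free(model);
}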
#include "models.h"
-llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool embed>
+llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
- auto * inp_attn = build_attn_inp_kv();
+ using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
+
+ inp_attn_type * inp_attn = nullptr;
+ if constexpr (embed) {
+ inp_attn = build_attn_inp_no_cache();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cb(cur, "result_norm", -1);
res->t_embd = cur;
- // lm_head
- cur = build_lora_mm(model.output, cur);
+ if constexpr (!embed) {
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
- cb(cur, "result_output", -1);
- res->t_logits = cur;
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+ }
ggml_build_forward_expand(gf, cur);
}
+
+template struct llm_build_llama<false>;
+template struct llm_build_llama<true>;
--- /dev/null
+
+#include "models.h"
+
+llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ ggml_tensor * inp_pos = build_inp_pos();
+ auto * inp_attn = build_attn_inp_kv_iswa();
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ uint32_t n_head_l = hparams.n_head(il);
+ uint32_t n_head_kv_l = hparams.n_head_kv(il);
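+ // per-layer RoPE parameters (SWA layers can use a different frequency base/scale)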
+ const float freq_base_l = model.get_rope_freq_base(cparams, il);
+ const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+ cur = inpL;
+
+ // self_attention
+ {
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ ggml_tensor * sinks = model.layers[il].attn_sinks;
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
+ }
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // feed-forward network
+ if (model.layers[il].ffn_gate_inp == nullptr) {
+ // dense branch
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ // MoE branch
+ cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
+ 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+ cb(cur, "ffn_moe_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
};
+template <bool embed>
struct llm_build_llama : public llm_graph_context {
llm_build_llama(const llama_model & model, const llm_graph_params & params);
};
llm_build_mamba(const llama_model & model, const llm_graph_params & params);
};
+struct llm_build_mimo2_iswa : public llm_graph_context {
+ llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
+};
+
struct llm_build_minicpm3 : public llm_graph_context {
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
};
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
};
+template <bool iswa>
+struct llm_build_modern_bert : public llm_graph_context {
+ llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
+};
+
struct llm_build_mpt : public llm_graph_context {
llm_build_mpt(const llama_model & model, const llm_graph_params & params);
};
llm_build_plamo(const llama_model & model, const llm_graph_params & params);
};
+template <bool iswa>
+struct llm_build_plamo3 : public llm_graph_context {
+ llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
+};
+
struct llm_build_plm : public llm_graph_context {
llm_build_plm(const llama_model & model, const llm_graph_params & params);
};
--- /dev/null
+#include "models.h"
+
+template <bool iswa>
+llm_build_modern_bert<iswa>::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ // construct input embeddings (token only; positions are handled via RoPE)
+ inpL = build_inp_embd(model.tok_embd);
+ cb(inpL, "inp_embd", -1);
+
+ // embed layer norm
+ inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
+ cb(inpL, "inp_norm", -1);
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ auto * inp_attn = build_attn_inp_no_cache();
+
+ for (int il = 0; il < n_layer; ++il) {
+ float freq_base_l = 0.0f;
+
+ if constexpr (iswa) {
+ freq_base_l = model.get_rope_freq_base(cparams, il);
+ } else {
+ freq_base_l = freq_base;
+ }
+
+ cur = inpL;
+
+ // attention layer norm
+ if (model.layers[il].attn_norm) {
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "attn_norm", il);
+ }
+
+ // self attention
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ const size_t type_size = ggml_type_size(cur->type);
+
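+ // split the fused QKV projection into Q, K and V views (Q: n_embd, K and V: n_embd_gqa each)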
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
+
+ // RoPE
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, nullptr,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ cb(cur, "kqv_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+ }
+
+ // re-add the layer input
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FFN layer norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
+
+ // residual: the attention output bypasses the FFN (intermediate) layer
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM, -1);
+ cb(cur, "final_norm_out", -1);
+
+ if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+ // extracting cls token
+ cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
+ cb(cur, "cls_pooled_embd", -1);
+ }
+
+ cb(cur, "res_embd", -1);
+ res->t_embd = cur;
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_modern_bert<false>;
+template struct llm_build_modern_bert<true>;
--- /dev/null
+#include "models.h"
+
+template <bool iswa>
+llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
+ llm_graph_context(params) {
+ const int64_t head_dim_q = hparams.n_embd_head_k;
+ const int64_t head_dim_v = hparams.n_embd_head_v;
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+ inp_attn_type * inp_attn = nullptr;
+
+ if constexpr (iswa) {
+ inp_attn = build_attn_inp_kv_iswa();
+ } else {
+ inp_attn = build_attn_inp_kv();
+ }
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * residual = inpL;
+
+ float freq_base_l = 0.0f;
+ float freq_scale_l = 0.0f;
+ if constexpr (iswa) {
+ freq_base_l = model.get_rope_freq_base (cparams, il);
+ freq_scale_l = model.get_rope_freq_scale(cparams, il);
+ } else {
+ freq_base_l = freq_base;
+ freq_scale_l = freq_scale;
+ }
+
+ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ const int32_t n_head = hparams.n_head(il);
+ const int32_t n_head_kv = hparams.n_head_kv(il);
+
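+ // element offsets of Q, K and V within the fused QKV projection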
+ const int64_t q_offset = 0;
+ const int64_t k_offset = head_dim_q * n_head;
+ const int64_t v_offset = k_offset + head_dim_q * n_head_kv;
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens,
+ head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens,
+ head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens,
+ head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv));
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "attn_q_norm", il);
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "attn_k_norm", il);
+
+ Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+ Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+ ext_factor, attn_factor, beta_fast, beta_slow);
+
+ const float attn_scale = 1.0f / sqrtf(float(head_dim_q));
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il);
+ cb(cur, "attn_out", il);
+
+ if (il == n_layer - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ residual = ggml_get_rows(ctx0, residual, inp_out_ids);
+ }
+
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "attn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, residual);
+ cb(cur, "attn_residual", il);
+
+ residual = cur;
+
+ cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
+ cb(cur, "ffn_post_norm", il);
+
+ cur = ggml_add(ctx0, cur, residual);
+ cb(cur, "ffn_residual", il);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+}
+
+// Explicit template instantiations
+template struct llm_build_plamo3<false>;
+template struct llm_build_plamo3<true>;