From: Georgi Gerganov Date: Wed, 31 Dec 2025 11:13:57 +0000 (+0200) Subject: talk-llama : sync llama.cpp X-Git-Tag: upstream/1.8.3~61 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=7359ac94d54967a39b6c24f170174a9fe09303da;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp talk-llama : sync llama.cpp --- diff --git a/examples/talk-llama/llama-adapter.cpp b/examples/talk-llama/llama-adapter.cpp index d8eef75a..bdc24c2d 100644 --- a/examples/talk-llama/llama-adapter.cpp +++ b/examples/talk-llama/llama-adapter.cpp @@ -146,9 +146,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) { return nullptr; } -static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) { +static void llama_adapter_lora_init_impl(const char * path_lora, llama_adapter_lora & adapter) { LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora); + llama_model & model = adapter.model; + ggml_context * ctx_init; gguf_init_params meta_gguf_params = { /* .no_alloc = */ true, @@ -411,14 +413,17 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_ } } + // update number of nodes used + model.n_lora_nodes += adapter.get_n_nodes(); + LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2); } llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) { - llama_adapter_lora * adapter = new llama_adapter_lora(); + llama_adapter_lora * adapter = new llama_adapter_lora(*model); try { - llama_adapter_lora_init_impl(*model, path_lora, *adapter); + llama_adapter_lora_init_impl(path_lora, *adapter); return adapter; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); @@ -469,6 +474,10 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter, } void llama_adapter_lora_free(llama_adapter_lora * adapter) { + // update number of nodes used + GGML_ASSERT(adapter->model.n_lora_nodes >= adapter->get_n_nodes()); + adapter->model.n_lora_nodes -= adapter->get_n_nodes(); + delete adapter; } diff --git a/examples/talk-llama/llama-adapter.h b/examples/talk-llama/llama-adapter.h index 4f65247c..42d64a6e 100644 --- a/examples/talk-llama/llama-adapter.h +++ b/examples/talk-llama/llama-adapter.h @@ -59,6 +59,8 @@ struct llama_adapter_lora_weight { }; struct llama_adapter_lora { + llama_model & model; + // map tensor name to lora_a_b std::unordered_map ab_map; @@ -73,10 +75,14 @@ struct llama_adapter_lora { // activated lora (aLoRA) std::vector alora_invocation_tokens; - llama_adapter_lora() = default; + llama_adapter_lora(llama_model & model) : model(model) {} ~llama_adapter_lora() = default; llama_adapter_lora_weight * get_weight(ggml_tensor * w); + + uint32_t get_n_nodes() const { + return ab_map.size() * 6u; // a, b, scale, add, 2 x mul_mat + } }; using llama_adapter_loras = std::unordered_map; diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index d0eaf317..94a6807e 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -20,6 +20,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_MODERN_BERT, "modern-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_NEO_BERT, "neo-bert" }, @@ -41,6 +42,7 @@ static const std::map LLM_ARCH_NAMES = { { 
LLM_ARCH_PHIMOE, "phimoe" }, { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO2, "plamo2" }, + { LLM_ARCH_PLAMO3, "plamo3" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, @@ -114,6 +116,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_MIMO2, "mimo2" }, + { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -204,6 +208,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, + { LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, "%s.attention.sliding_window_pattern" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" }, { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" }, @@ -214,6 +219,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, @@ -497,6 +503,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_DECI: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_LLAMA_EMBED: return { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT_NORM, @@ -778,6 +785,20 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, }; + case LLM_ARCH_MODERN_BERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_CLS, + LLM_TENSOR_CLS_OUT, + }; case LLM_ARCH_JINA_BERT_V2: return { LLM_TENSOR_TOKEN_EMBD, @@ -1057,6 +1078,22 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_FFN_POST_NORM, }; + case LLM_ARCH_PLAMO3: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_QKV, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_ATTN_POST_NORM, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_POST_NORM, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + }; case LLM_ARCH_CODESHELL: return { LLM_TENSOR_TOKEN_EMBD, @@ -2171,6 +2208,27 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_VISEXP_FFN_DOWN, LLM_TENSOR_VISEXP_FFN_UP, }; + case LLM_ARCH_MIMO2: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_SINKS, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; case LLM_ARCH_GPTJ: case LLM_ARCH_UNKNOWN: return { diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index 6cbf9b1f..714ead40 100644 --- 
a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -24,6 +24,7 @@ enum llm_arch { LLM_ARCH_STARCODER, LLM_ARCH_REFACT, LLM_ARCH_BERT, + LLM_ARCH_MODERN_BERT, LLM_ARCH_NOMIC_BERT, LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_NEO_BERT, @@ -45,6 +46,7 @@ enum llm_arch { LLM_ARCH_PHIMOE, LLM_ARCH_PLAMO, LLM_ARCH_PLAMO2, + LLM_ARCH_PLAMO3, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, @@ -118,6 +120,8 @@ enum llm_arch { LLM_ARCH_RND1, LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, + LLM_ARCH_MIMO2, + LLM_ARCH_LLAMA_EMBED, LLM_ARCH_UNKNOWN, }; @@ -208,6 +212,7 @@ enum llm_kv { LLM_KV_ATTENTION_GATE_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, + LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_OUTPUT_SCALE, LLM_KV_ATTENTION_TEMPERATURE_LENGTH, @@ -218,6 +223,7 @@ enum llm_kv { LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, + LLM_KV_ROPE_FREQ_BASE_SWA, LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index 8786d4ee..34dfcd47 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -294,8 +294,8 @@ llama_context::llama_context( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = model.n_devices() > 1 && - model.params.n_gpu_layers > (int) model.hparams.n_layer && - model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && + model.n_gpu_layers() > model.hparams.n_layer && + model.split_mode() == LLAMA_SPLIT_MODE_LAYER && cparams.offload_kqv && !model.has_tensor_overrides(); @@ -459,23 +459,22 @@ llama_context::llama_context( } llama_context::~llama_context() { - // FIXME this currently results in a use-after-free bug if the model is freed before the context - // if (!model.hparams.no_alloc) { - // for (size_t i = 0; i < backend_ptrs.size(); ++i) { - // ggml_backend_t backend = backend_ptrs[i]; - // ggml_backend_buffer_type_t buft = backend_buft[i]; - - // const size_t size_exp = backend_buf_exp_size[i]; - // const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend); - // if (size_exp == size_act) { - // LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n", - // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); - // } else { - // LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n", - // __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); - // } - // } - // } + if (!model.hparams.no_alloc) { + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + + const size_t size_exp = backend_buf_exp_size[i]; + const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size_exp == size_act) { + LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } else { + LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } + } + } 
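The adapter changes above make the graph-node budget explicit: each LoRA A/B pair contributes a fixed number of extra graph nodes, the model accumulates that count while adapters are loaded, and graph_max_nodes() (further below) adds it on top of the base budget. A minimal standalone sketch of that bookkeeping, using toy stand-in types rather than the real llama_model / llama_adapter_lora:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

struct toy_model {
    uint32_t n_tensors    = 1000;
    uint32_t n_lora_nodes = 0; // extra graph nodes required by currently loaded adapters
};

struct toy_adapter {
    toy_model & model;
    std::unordered_map<std::string, int> ab_map; // tensor name -> LoRA A/B pair

    explicit toy_adapter(toy_model & m) : model(m) {}

    // each A/B pair adds a, b, scale, add and two mul_mat nodes to the graph
    uint32_t get_n_nodes() const { return (uint32_t) ab_map.size() * 6u; }
};

static uint32_t graph_max_nodes(const toy_model & m) {
    const uint32_t base = std::max(1024u, 8u * m.n_tensors);
    return base + m.n_lora_nodes; // reserve room for the adapter nodes
}

int main() {
    toy_model model;
    toy_adapter adapter(model);
    adapter.ab_map = { { "blk.0.attn_q.weight", 0 }, { "blk.0.attn_k.weight", 1 } };

    model.n_lora_nodes += adapter.get_n_nodes(); // on adapter init
    std::printf("max nodes with adapter:    %u\n", graph_max_nodes(model));

    model.n_lora_nodes -= adapter.get_n_nodes(); // on adapter free
    std::printf("max nodes without adapter: %u\n", graph_max_nodes(model));
}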
ggml_opt_free(opt_ctx); } @@ -1443,7 +1442,9 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { if (model.arch == LLM_ARCH_QWEN3NEXT) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } - return std::max(1024u, 8u*model.n_tensors()); + uint32_t res = std::max(1024u, 8u*model.n_tensors()); + res += model.n_lora_nodes; + return res; } llm_graph_result * llama_context::get_gf_res_reserve() const { @@ -1571,7 +1572,7 @@ llm_graph_cb llama_context::graph_get_cb() const { // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; + const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index f6e95b5d..42def73f 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -123,10 +123,11 @@ struct llama_hparams { llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; // the size of the sliding window (0 - no SWA) uint32_t n_swa = 0; - // if swa_layers[il] == true, then layer il is SWA - // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA) + // if swa_layers[il] == 1, then layer il is SWA + // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA) // by default, all layers are dense - std::array swa_layers; + // note: using uint32_t type for compatibility reason + std::array swa_layers; // for State Space Models uint32_t ssm_d_conv = 0; diff --git a/examples/talk-llama/llama-kv-cache.h b/examples/talk-llama/llama-kv-cache.h index 1868f118..0c4ed648 100644 --- a/examples/talk-llama/llama-kv-cache.h +++ b/examples/talk-llama/llama-kv-cache.h @@ -305,7 +305,7 @@ public: bool do_shift, stream_copy_info sc_info); - // used to create a batch procesing context from a batch + // used to create a batch processing context from a batch llama_kv_cache_context( llama_kv_cache * kv, slot_info_vec_t sinfos, diff --git a/examples/talk-llama/llama-mmap.cpp b/examples/talk-llama/llama-mmap.cpp index 0641c2d2..23b648a2 100644 --- a/examples/talk-llama/llama-mmap.cpp +++ b/examples/talk-llama/llama-mmap.cpp @@ -13,9 +13,10 @@ #ifdef __has_include #if __has_include() #include + #include + #include #if defined(_POSIX_MAPPED_FILES) #include - #include #endif #if defined(_POSIX_MEMLOCK_RANGE) #include @@ -74,7 +75,7 @@ struct llama_file::impl { return ret; } - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -153,13 +154,40 @@ struct llama_file::impl { write_raw(&val, sizeof(val)); } + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + throw std::runtime_error("DirectIO is not implemented on Windows."); + } + ~impl() { if (fp) { std::fclose(fp); } } #else - impl(const char * fname, const char * mode) { + impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) { +#ifdef __linux__ + // Try unbuffered I/O for read only + if (use_direct_io && std::strcmp(mode, "rb") == 0) { + fd = open(fname, O_RDONLY | O_DIRECT); + + if (fd != -1) { + struct stat 
file_stats{}; + fstat(fd, &file_stats); + + size = file_stats.st_size; + alignment = file_stats.st_blksize; + + off_t ret = lseek(fd, 0, SEEK_SET); + if (ret == -1) { + throw std::runtime_error(format("seek error: %s", strerror(errno))); + } + return; + } + + LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O", + fname, strerror(errno)); + } +#endif fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); @@ -170,27 +198,30 @@ struct llama_file::impl { } size_t tell() const { -// TODO: this ifdef is never true? -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - if (ret == -1) { - throw std::runtime_error(format("ftell error: %s", strerror(errno))); + if (fd == -1) { + long ret = std::ftell(fp); + if (ret == -1) { + throw std::runtime_error(format("ftell error: %s", strerror(errno))); + } + + return (size_t) ret; } - return (size_t) ret; + off_t pos = lseek(fd, 0, SEEK_CUR); + if (pos == -1) { + throw std::runtime_error(format("lseek error: %s", strerror(errno))); + } + return (size_t) pos; } void seek(size_t offset, int whence) const { -// TODO: this ifdef is never true? -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - if (ret != 0) { + off_t ret = 0; + if (fd == -1) { + ret = std::fseek(fp, (long) offset, whence); + } else { + ret = lseek(fd, offset, whence); + } + if (ret == -1) { throw std::runtime_error(format("seek error: %s", strerror(errno))); } } @@ -200,13 +231,55 @@ struct llama_file::impl { return; } errno = 0; - std::size_t ret = std::fread(ptr, len, 1, fp); - if (ferror(fp)) { - throw std::runtime_error(format("read error: %s", strerror(errno))); + if (fd == -1) { + std::size_t ret = std::fread(ptr, len, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error("unexpectedly reached end of file"); + } + } else { + bool successful = false; + while (!successful) { + off_t ret = read(fd, ptr, len); + + if (ret == -1) { + if (errno == EINTR) { + continue; // Interrupted by signal, retry + } + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret == 0) { + throw std::runtime_error("unexpectedly reached end of file"); + } + + successful = true; + } } - if (ret != 1) { - throw std::runtime_error("unexpectedly reached end of file"); + } + + void read_aligned_chunk(size_t offset, void * dest, size_t size) const { + off_t aligned_offset = offset & ~(alignment - 1); + off_t offset_from_alignment = offset - aligned_offset; + size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1); + + void * raw_buffer = nullptr; + int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read); + if (ret != 0) { + throw std::runtime_error(format("posix_memalign failed with error %d", ret)); } + + struct aligned_buffer_deleter { + void operator()(void * p) const { free(p); } + }; + std::unique_ptr buffer(raw_buffer); + + seek(aligned_offset, SEEK_SET); + read_raw(buffer.get(), bytes_to_read); + + uintptr_t actual_data = reinterpret_cast(buffer.get()) + offset_from_alignment; + memcpy(dest, reinterpret_cast(actual_data), size); } uint32_t read_u32() const { @@ -231,22 +304,43 @@ struct llama_file::impl { } ~impl() { - if (fp) { + if (fd != -1) { + close(fd); + } else { std::fclose(fp); } } + int fd = -1; #endif - FILE * fp; - 
size_t size; + void read_raw_at(void * ptr, size_t len, size_t offset) const { + if (alignment != 1) { + read_aligned_chunk(offset, ptr, len); + } else { + seek(offset, SEEK_SET); + read_raw(ptr, len); + } + } + + size_t read_alignment() const { + return alignment; + } + + size_t alignment = 1; + + FILE * fp{}; + size_t size{}; }; -llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique(fname, mode)) {} +llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) : + pimpl(std::make_unique(fname, mode, use_direct_io)) {} llama_file::~llama_file() = default; size_t llama_file::tell() const { return pimpl->tell(); } size_t llama_file::size() const { return pimpl->size; } +size_t llama_file::read_alignment() const { return pimpl->read_alignment(); } + int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); @@ -261,6 +355,7 @@ int llama_file::file_id() const { void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); } void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); } +void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); } uint32_t llama_file::read_u32() const { return pimpl->read_u32(); } diff --git a/examples/talk-llama/llama-mmap.h b/examples/talk-llama/llama-mmap.h index 4e5aec3f..729aac16 100644 --- a/examples/talk-llama/llama-mmap.h +++ b/examples/talk-llama/llama-mmap.h @@ -3,6 +3,7 @@ #include #include #include +#include struct llama_file; struct llama_mmap; @@ -13,7 +14,7 @@ using llama_mmaps = std::vector>; using llama_mlocks = std::vector>; struct llama_file { - llama_file(const char * fname, const char * mode); + llama_file(const char * fname, const char * mode, bool use_direct_io = false); ~llama_file(); size_t tell() const; @@ -24,11 +25,14 @@ struct llama_file { void seek(size_t offset, int whence) const; void read_raw(void * ptr, size_t len) const; + void read_raw_at(void * ptr, size_t len, size_t offset) const; + void read_aligned_chunk(size_t offset, void * dest, size_t size) const; uint32_t read_u32() const; void write_raw(const void * ptr, size_t len) const; void write_u32(uint32_t val) const; + size_t read_alignment() const; private: struct impl; std::unique_ptr pimpl; diff --git a/examples/talk-llama/llama-model-loader.cpp b/examples/talk-llama/llama-model-loader.cpp index ca2ea246..5003b4fb 100644 --- a/examples/talk-llama/llama-model-loader.cpp +++ b/examples/talk-llama/llama-model-loader.cpp @@ -462,6 +462,29 @@ namespace GGUFMeta { return get_key_or_arr(llm_kv(kid), result, n, required); } + bool llama_model_loader::get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required) { + const std::string key = llm_kv(kid); + + const int id = gguf_find_key(meta.get(), key.c_str()); + + if (id < 0) { + if (required) { + throw std::runtime_error(format("key not found in model: %s", key.c_str())); + } + return false; + } + + // throw and error if type is an array + if (gguf_get_kv_type(meta.get(), id) == GGUF_TYPE_ARRAY) { + if (required) { + throw std::runtime_error(format("expected scalar, found array for key: %s", key.c_str())); + } + return false; + } + + return get_key(key, result, required); + } + // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t 
n, bool required); @@ -504,7 +527,7 @@ llama_model_loader::llama_model_loader( get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb")); + files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -572,7 +595,7 @@ llama_model_loader::llama_model_loader( } } - files.emplace_back(new llama_file(fname_split, "rb")); + files.emplace_back(new llama_file(fname_split, "rb", !use_mmap)); contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -935,7 +958,15 @@ bool llama_model_loader::load_all_data( // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives. // NVMe raid configurations might require more / larger buffers. constexpr size_t n_buffers = 4; - constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB + + size_t alignment = 1; + for (const auto & file : files) { + alignment = std::max(file->read_alignment(), alignment); + } + + // Buffer size: balance between memory usage and I/O efficiency + // 64MB works well for NVMe drives + const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024; std::vector host_buffers; std::vector events; @@ -985,6 +1016,7 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); @@ -1066,9 +1098,9 @@ bool llama_model_loader::load_all_data( } } else { const auto & file = files.at(weight->idx); + if (ggml_backend_buffer_is_host(cur->buffer)) { - file->seek(weight->offs, SEEK_SET); - file->read_raw(cur->data, n_size); + file->read_raw_at(cur->data, n_size, weight->offs); if (check_tensors) { validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); @@ -1077,26 +1109,60 @@ bool llama_model_loader::load_all_data( } else { // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU. 
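The O_DIRECT read path added to llama-mmap.cpp rounds every file access to the device block size: the offset is rounded down to an aligned start, the length is rounded up to a whole number of blocks, and the requested bytes are then copied out of the aligned buffer. A minimal sketch of that arithmetic, assuming a power-of-two alignment (as st_blksize is); the helper names are illustrative, only the formulas mirror the patch:

#include <cstddef>
#include <cstdio>

struct aligned_read_plan {
    size_t aligned_offset; // file offset rounded down to the block size
    size_t head_padding;   // bytes preceding the requested data in the first block
    size_t bytes_to_read;  // total read length rounded up to whole blocks
};

static aligned_read_plan plan_aligned_read(size_t offset, size_t size, size_t alignment) {
    aligned_read_plan p;
    p.aligned_offset = offset & ~(alignment - 1);
    p.head_padding   = offset - p.aligned_offset;
    p.bytes_to_read  = (p.head_padding + size + alignment - 1) & ~(alignment - 1);
    return p;
}

int main() {
    // e.g. a 10000-byte tensor at file offset 12345 on a device with 4096-byte blocks
    const aligned_read_plan p = plan_aligned_read(12345, 10000, 4096);
    std::printf("aligned_offset=%zu head_padding=%zu bytes_to_read=%zu\n",
                p.aligned_offset, p.head_padding, p.bytes_to_read);
    // the caller reads bytes_to_read into a posix_memalign'd buffer at aligned_offset
    // and copies size bytes starting at buffer + head_padding into the destination
}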
if (upload_backend) { - file->seek(weight->offs, SEEK_SET); + size_t offset = weight->offs; + alignment = file->read_alignment(); + size_t aligned_offset = offset & ~(alignment - 1); + size_t offset_from_alignment = offset - aligned_offset; + file->seek(aligned_offset, SEEK_SET); + + // Calculate aligned read boundaries + size_t read_start = aligned_offset; + size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1); size_t bytes_read = 0; + size_t data_read = 0; // Actual tensor data copied (excluding padding) - while (bytes_read < n_size) { - size_t read_iteration = std::min(buffer_size, n_size - bytes_read); + while (bytes_read < read_end - read_start) { + size_t read_size = std::min(buffer_size, read_end - read_start - bytes_read); + // Align the destination pointer within the pinned buffer + uintptr_t ptr_dest_aligned = (reinterpret_cast(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1); + + // Wait for previous upload to complete before reusing buffer ggml_backend_event_synchronize(events[buffer_idx]); - file->read_raw(host_ptrs[buffer_idx], read_iteration); - ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration); + + // Read aligned chunk from file + file->read_raw(reinterpret_cast(ptr_dest_aligned), read_size); + + // Calculate actual data portion (excluding alignment padding) + uintptr_t ptr_data = ptr_dest_aligned; + size_t data_to_copy = read_size; + + // Skip alignment padding at start of first chunk + if (bytes_read == 0) { + ptr_data += offset_from_alignment; + data_to_copy -= offset_from_alignment; + } + + // Trim alignment padding at end of last chunk + if (aligned_offset + bytes_read + read_size > offset + n_size) { + data_to_copy -= (read_end - (offset + n_size)); + } + + // Async upload actual data to GPU + ggml_backend_tensor_set_async(upload_backend, cur, + reinterpret_cast(ptr_data), data_read, data_to_copy); ggml_backend_event_record(events[buffer_idx], upload_backend); - bytes_read += read_iteration; + data_read += data_to_copy; + bytes_read += read_size; + ++buffer_idx; buffer_idx %= n_buffers; } } else { read_buf.resize(n_size); - file->seek(weight->offs, SEEK_SET); - file->read_raw(read_buf.data(), n_size); + file->read_raw_at(read_buf.data(), n_size, weight->offs); ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); diff --git a/examples/talk-llama/llama-model-loader.h b/examples/talk-llama/llama-model-loader.h index 0380c92f..d13299ad 100644 --- a/examples/talk-llama/llama-model-loader.h +++ b/examples/talk-llama/llama-model-loader.h @@ -131,6 +131,8 @@ struct llama_model_loader { template bool get_key_or_arr(enum llm_kv kid, T & result, uint32_t n, bool required = true); + bool get_key_or_arr(enum llm_kv kid, uint32_t & result, bool required = true); + std::string get_arch_name() const; enum llm_arch get_arch() const; diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index c9a3c5df..5e664c8c 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -31,12 +31,14 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_17M: return "17M"; case LLM_TYPE_22M: return "22M"; case LLM_TYPE_33M: return "33M"; + case LLM_TYPE_47M: return "47M"; case LLM_TYPE_60M: return "60M"; case LLM_TYPE_70M: return "70M"; case LLM_TYPE_80M: return "80M"; case 
LLM_TYPE_109M: return "109M"; case LLM_TYPE_137M: return "137M"; case LLM_TYPE_140M: return "140M"; + case LLM_TYPE_149M: return "149M"; case LLM_TYPE_160M: return "160M"; case LLM_TYPE_190M: return "190M"; case LLM_TYPE_220M: return "220M"; @@ -46,6 +48,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_335M: return "335M"; case LLM_TYPE_350M: return "350M"; case LLM_TYPE_360M: return "360M"; + case LLM_TYPE_395M: return "395M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; case LLM_TYPE_475M: return "475M"; @@ -127,6 +130,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_230B_A10B: return "230B.A10B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_300B_A47B: return "300B.A47B"; + case LLM_TYPE_310B_A15B: return "310B.A15B"; case LLM_TYPE_355B_A32B: return "355B.A32B"; case LLM_TYPE_E2B: return "E2B"; case LLM_TYPE_E4B: return "E4B"; @@ -603,7 +607,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) { + if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -627,6 +631,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // arch-specific KVs switch (arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_LLAMA_EMBED: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -875,6 +880,34 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_MODERN_BERT: + { + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (found_swa && hparams.n_swa > 0) { + uint32_t swa_period = 3; + hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC; + + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); + hparams.set_swa_pattern(swa_period); + } else { + hparams.swa_type = LLAMA_SWA_TYPE_NONE; + } + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); + + switch (hparams.n_layer) { + case 12: + type = LLM_TYPE_47M; break; // granite-embedding-small + case 22: + type = LLM_TYPE_149M; break; // modern-bert-base + case 28: + type = LLM_TYPE_395M; break; // modern-bert-large + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_JINA_BERT_V2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -1194,6 +1227,26 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); } break; + case LLM_ARCH_PLAMO3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (found_swa && hparams.n_swa > 0) { + uint32_t swa_period = 8; + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.rope_freq_scale_train_swa = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, 
false); + hparams.set_swa_pattern(swa_period); + } else { + hparams.swa_type = LLAMA_SWA_TYPE_NONE; + } + + switch (hparams.n_layer) { + case 24: type = LLM_TYPE_2B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_GPT2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2307,6 +2360,22 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_MIMO2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + + switch (hparams.n_layer) { + case 48: type = LLM_TYPE_310B_A15B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -2329,11 +2398,11 @@ void llama_model::load_vocab(llama_model_loader & ml) { bool llama_model::load_tensors(llama_model_loader & ml) { const auto & split_mode = params.split_mode; - const auto & n_gpu_layers = params.n_gpu_layers; const auto & use_mlock = params.use_mlock; const auto & tensor_split = params.tensor_split; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer; + const int n_gpu_layers = this->n_gpu_layers(); const bool use_mmap_buffer = true; @@ -2378,10 +2447,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (cpu_dev == nullptr) { throw std::runtime_error(format("%s: no CPU backend found", __func__)); } - const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); + const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); + const int act_gpu_layers = devices.empty() ? 
0 : std::min(n_gpu_layers, int(n_layer) + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); + const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -2621,6 +2690,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_LLAMA_EMBED: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -3155,6 +3225,37 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0); } } break; + case LLM_ARCH_MODERN_BERT: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + for(int i = 0; i < n_layer; ++i) { + auto& layer = layers[i]; + + if ( i != 0 ) { + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + } else{ + // layer 0 uses identity + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + } + + + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3 * n_embd }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, 2 * n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + } + + cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED); + cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED); + cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {hparams.n_cls_out}, TENSOR_NOT_REQUIRED); + + } break; case LLM_ARCH_NEO_BERT: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -3747,6 +3848,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0); } } break; + case LLM_ARCH_PLAMO3: + { + const int64_t head_dim_q = hparams.n_embd_head_k; + const int64_t head_dim_v = hparams.n_embd_head_v; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t num_attention_heads = hparams.n_head(i); + const int64_t num_key_value_heads = hparams.n_head_kv(i); + const int64_t q_proj_dim = num_attention_heads * head_dim_q; + const int64_t k_proj_dim = num_key_value_heads * head_dim_q; + const int64_t v_proj_dim = num_key_value_heads * head_dim_v; + const int64_t n_ff_cur 
= hparams.n_ff(i); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), + {n_embd,q_proj_dim + k_proj_dim + v_proj_dim}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {head_dim_q}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {head_dim_q}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {num_attention_heads * head_dim_v, n_embd}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, i), {n_embd}, 0); + + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_cur * 2}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_cur, n_embd}, 0); + } + } break; case LLM_ARCH_GPT2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -5181,9 +5320,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_group = hparams.ssm_n_group; const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head; - const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; - const int64_t n_ff_shexp = hparams.n_ff_shexp; - // embeddings tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -5235,6 +5371,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); } else { if (n_expert != 0) { + const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; + const int64_t n_ff_shexp = hparams.n_ff_shexp; + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert}, 0); layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0); @@ -6584,6 +6723,44 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0); } } break; + case LLM_ARCH_MIMO2: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); + uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); + uint32_t n_head = hparams.n_head(i); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_v * n_head, n_embd }, 0); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_sinks = create_tensor(tn(LLM_TENSOR_ATTN_SINKS, "weight", i), {n_head}, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + // non-MoE branch + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + + // MoE branch + int64_t n_ff_exp = hparams.n_ff_exp; + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -6693,10 +6870,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (llama_supports_gpu_offload()) { const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); - LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); - if (n_gpu_layers > (int) hparams.n_layer) { + int n_repeating = n_gpu; + if (n_repeating > 0) { LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__); + n_repeating--; } + LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; @@ -6763,6 +6942,14 @@ size_t llama_model::n_devices() const { return devices.size(); } +uint32_t 
llama_model::n_gpu_layers() const { + return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1; +} + +llama_split_mode llama_model::split_mode() const { + return params.split_mode; +} + std::map llama_model::memory_breakdown() const { std::map ret; for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) { @@ -7087,6 +7274,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_NEO_BERT: case LLM_ARCH_WAVTOKENIZER_DEC: + case LLM_ARCH_MODERN_BERT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: @@ -7204,16 +7392,20 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { switch (arch) { case LLM_ARCH_LLAMA: { - llm = std::make_unique(*this, params); + llm = std::make_unique>(*this, params); } break; case LLM_ARCH_LLAMA4: { if (hparams.swa_type == LLAMA_SWA_TYPE_NONE) { - llm = std::make_unique(*this, params); + llm = std::make_unique>(*this, params); } else { llm = std::make_unique(*this, params); } } break; + case LLM_ARCH_LLAMA_EMBED: + { + llm = std::make_unique>(*this, params); + } break; case LLM_ARCH_DECI: { llm = std::make_unique(*this, params); @@ -7246,6 +7438,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_MODERN_BERT: + { + llm = std::make_unique>(*this, params); + } break; case LLM_ARCH_NEO_BERT: { llm = std::make_unique(*this, params); @@ -7335,6 +7531,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_PLAMO3: + { + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + llm = std::make_unique> (*this, params); + } else { + llm = std::make_unique>(*this, params); + } + } break; case LLM_ARCH_GPT2: { llm = std::make_unique(*this, params); @@ -7635,6 +7839,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_MIMO2: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -7660,7 +7868,7 @@ llama_model_params llama_model_default_params() { llama_model_params result = { /*.devices =*/ nullptr, /*.tensor_buft_overrides =*/ nullptr, - /*.n_gpu_layers =*/ 999, + /*.n_gpu_layers =*/ -1, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, @@ -7805,6 +8013,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_LLAMA_EMBED: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -7814,6 +8023,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V3: + case LLM_ARCH_MODERN_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: @@ -7833,6 +8043,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PHIMOE: case LLM_ARCH_PLAMO: case LLM_ARCH_PLAMO2: + case LLM_ARCH_PLAMO3: case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: @@ -7863,6 +8074,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_PANGU_EMBED: case LLM_ARCH_AFMOE: case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_MIMO2: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/examples/talk-llama/llama-model.h 
b/examples/talk-llama/llama-model.h index c6eb9531..f4f44a92 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -24,12 +24,14 @@ enum llm_type { LLM_TYPE_17M, LLM_TYPE_22M, LLM_TYPE_33M, + LLM_TYPE_47M, LLM_TYPE_60M, LLM_TYPE_70M, LLM_TYPE_80M, LLM_TYPE_109M, LLM_TYPE_137M, LLM_TYPE_140M, + LLM_TYPE_149M, LLM_TYPE_160M, LLM_TYPE_190M, LLM_TYPE_220M, @@ -39,6 +41,7 @@ enum llm_type { LLM_TYPE_335M, LLM_TYPE_350M, LLM_TYPE_360M, + LLM_TYPE_395M, LLM_TYPE_410M, LLM_TYPE_450M, LLM_TYPE_475M, @@ -120,6 +123,7 @@ enum llm_type { LLM_TYPE_230B_A10B, // Minimax M2 LLM_TYPE_235B_A22B, LLM_TYPE_300B_A47B, // Ernie MoE big + LLM_TYPE_310B_A15B, // /MiMo-V2-Flash LLM_TYPE_355B_A32B, // GLM-4.5 LLM_TYPE_E2B, LLM_TYPE_E4B, @@ -462,8 +466,6 @@ struct llama_model { struct ggml_tensor * dense_2_out_layers = nullptr; struct ggml_tensor * dense_3_out_layers = nullptr; - llama_model_params params; - // gguf metadata std::unordered_map gguf_kv; @@ -473,6 +475,9 @@ struct llama_model { // for quantize-stats only std::vector> tensors_by_name; + // for keeping track of extra nodes used by lora adapters + uint32_t n_lora_nodes = 0; + int64_t t_load_us = 0; int64_t t_start_us = 0; @@ -494,6 +499,9 @@ struct llama_model { size_t n_tensors() const; size_t n_devices() const; + uint32_t n_gpu_layers() const; + llama_split_mode split_mode() const; + std::map memory_breakdown() const; // total number of parameters in the model @@ -522,6 +530,8 @@ struct llama_model { ggml_cgraph * build_graph(const llm_graph_params & params) const; private: + llama_model_params params; + struct impl; std::unique_ptr pimpl; }; diff --git a/examples/talk-llama/llama-sampling.cpp b/examples/talk-llama/llama-sampling.cpp index 3f4a729b..f3891453 100644 --- a/examples/talk-llama/llama-sampling.cpp +++ b/examples/talk-llama/llama-sampling.cpp @@ -362,23 +362,39 @@ const char * llama_sampler_name(const struct llama_sampler * smpl) { } void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) { + if (!smpl) { + return; + } + if (smpl->iface->accept) { smpl->iface->accept(smpl, token); } } void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) { + if (!smpl) { + return; + } + GGML_ASSERT(smpl->iface->apply); smpl->iface->apply(smpl, cur_p); } void llama_sampler_reset(struct llama_sampler * smpl) { + if (!smpl) { + return; + } + if (smpl->iface->reset) { smpl->iface->reset(smpl); } } struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { + if (!smpl) { + return nullptr; + } + if (smpl->iface->clone) { return smpl->iface->clone(smpl); } @@ -405,39 +421,6 @@ void llama_sampler_free(struct llama_sampler * smpl) { delete smpl; } -llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { - const auto * logits = llama_get_logits_ith(ctx, idx); - - const llama_model * model = llama_get_model(ctx); - const llama_vocab * vocab = llama_model_get_vocab(model); - - const int n_vocab = llama_vocab_n_tokens(vocab); - - // TODO: do not allocate each time - std::vector cur; - cur.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array cur_p = { - /* .data = */ cur.data(), - /* .size = */ cur.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(smpl, &cur_p); - - GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); - - 
auto token = cur_p.data[cur_p.selected].id; - - llama_sampler_accept(smpl, token); - - return token; -} - // sampler chain static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { @@ -511,12 +494,56 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param /* .ctx = */ new llama_sampler_chain { /* .params = */ params, /* .samplers = */ {}, + /* .cur = */ {}, /* .t_sample_us = */ 0, /* .n_sample = */ 0, } ); } +llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) { + const auto * logits = llama_get_logits_ith(ctx, idx); + + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + + const int n_vocab = llama_vocab_n_tokens(vocab); + + // use pre-allocated buffer from chain if available, otherwise allocate locally + std::vector * cur_ptr; + std::vector cur_local; + + if (smpl->iface == &llama_sampler_chain_i) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + cur_ptr = &chain->cur; + } else { + cur_ptr = &cur_local; + } + + auto & cur = *cur_ptr; + cur.resize(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(smpl, &cur_p); + + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; +} + void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { auto * p = (llama_sampler_chain *) chain->ctx; p->samplers.push_back(smpl); diff --git a/examples/talk-llama/llama-sampling.h b/examples/talk-llama/llama-sampling.h index 759dd7dc..1e3de4e2 100644 --- a/examples/talk-llama/llama-sampling.h +++ b/examples/talk-llama/llama-sampling.h @@ -16,6 +16,9 @@ struct llama_sampler_chain { std::vector samplers; + // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations + std::vector cur; + // timing mutable int64_t t_sample_us; diff --git a/examples/talk-llama/llama-vocab.cpp b/examples/talk-llama/llama-vocab.cpp index 7b01a2ed..cd4092ca 100644 --- a/examples/talk-llama/llama-vocab.cpp +++ b/examples/talk-llama/llama-vocab.cpp @@ -1878,7 +1878,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-de" || tokenizer_pre == "a.x-4.0" || - tokenizer_pre == "mellum") { + tokenizer_pre == "mellum" || + tokenizer_pre == "modern-bert" ) { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "jina-v1-en" || @@ -2528,6 +2529,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { for (const auto * token : {"", "", "<|endoftext|>"}) { _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); } + } else if (_contains_any(model_name, {"modern-bert"})) { + if (token_to_id.count("[MASK]") == 0 ) { + LLAMA_LOG_WARN("%s: Mask token missing in vocab!\n", __func__); + } + else { + _set_token_attr("[MASK]", LLAMA_TOKEN_ATTR_LSTRIP, true); + } } } } diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp index c8b5febe..76b3acba 100644 --- a/examples/talk-llama/llama.cpp +++ b/examples/talk-llama/llama.cpp @@ -140,6 +140,10 @@ enum layer_fraction_t { }; // this enum is only used in 
llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue +class llama_params_fit_exception : public std::runtime_error { + using std::runtime_error::runtime_error; +}; + static void llama_params_fit_impl( const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, @@ -181,12 +185,11 @@ static void llama_params_fit_impl( } } - int64_t sum_total = 0; + int64_t sum_free = 0; int64_t sum_projected_free = 0; int64_t min_projected_free = INT64_MAX; int64_t sum_projected_used = 0; int64_t sum_projected_model = 0; - int64_t sum_projected_ctx = 0; if (nd > 1) { LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); @@ -197,12 +200,11 @@ static void llama_params_fit_impl( const int64_t projected_used = dmd.mb.total(); const int64_t projected_free = dmd.free - projected_used; - sum_total += dmd.total; + sum_free += dmd.free; sum_projected_used += projected_used; sum_projected_free += projected_free; min_projected_free = std::min(min_projected_free, projected_free); sum_projected_model += dmd.mb.model; - sum_projected_ctx += dmd.mb.context; if (nd > 1) { LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n", @@ -210,10 +212,9 @@ static void llama_params_fit_impl( projected_free >= 0 ? "surplus" : "deficit"); } } - assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0); - assert(sum_projected_used >= sum_projected_ctx); + assert(sum_free >= 0 && sum_projected_used >= 0); LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", - __func__, sum_projected_used/MiB, sum_total/MiB); + __func__, sum_projected_used/MiB, sum_free/MiB); if (min_projected_free >= margin) { if (nd == 1) { LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", @@ -236,9 +237,7 @@ static void llama_params_fit_impl( __func__, margin/MiB, -global_surplus/MiB); if (cparams->n_ctx == 0) { if (hp_nct > n_ctx_min) { - const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct; - - int64_t memory_reduction = -global_surplus; + int64_t sum_used_target = sum_free - nd*margin_s; if (nd > 1) { // for multiple devices we need to be more conservative in terms of how much context we think can fit: // - for dense models only whole layers can be assigned to devices @@ -246,24 +245,34 @@ static void llama_params_fit_impl( // - on average we expect a waste of 0.5 layers/tensors per device // - use slightly more than the expected average for nd devices to be safe const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl); - memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6); + sum_used_target -= (nd + 1) * model_per_layer / (hp_nex == 0 ? 
2 : 6); } - uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min); - cparams->n_ctx = hp_nct - ctx_reduction; - cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend - - ctx_reduction = hp_nct - cparams->n_ctx; - memory_reduction = ctx_reduction * bytes_per_ctx; - global_surplus += memory_reduction; - LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", - __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); - if (global_surplus >= 0) { + int64_t sum_projected_used_min_ctx = 0; + cparams->n_ctx = n_ctx_min; + const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + for (const auto & dmd : dmds_min_ctx) { + sum_projected_used_min_ctx += dmd.mb.total(); + } + if (sum_used_target > sum_projected_used_min_ctx) { + // linear interpolation between minimum and maximum context size: + cparams->n_ctx += (hp_nct - n_ctx_min) * (sum_used_target - sum_projected_used_min_ctx) + / (sum_projected_used - sum_projected_used_min_ctx); + cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend + + const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min); + const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx; + LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", + __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); if (nd == 1) { LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__); return; } LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__); + } else { + const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx; + LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", + __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); } } else { LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. 
context size of %" PRIu32 " -> no change\n", @@ -276,32 +285,28 @@ static void llama_params_fit_impl( } if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { - throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); + throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); } if (nd > 1) { if (!tensor_split) { - throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort"); + throw llama_params_fit_exception("did not provide a buffer to write the tensor_split to, abort"); } if (mparams->tensor_split) { for (size_t id = 0; id < nd; id++) { if (mparams->tensor_split[id] != 0.0f) { - throw std::runtime_error("model_params::tensor_split already set by user, abort"); + throw llama_params_fit_exception("model_params::tensor_split already set by user, abort"); } } } if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { - throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); - } - if (hp_ngl < 2*nd) { - throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least " - + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort"); + throw llama_params_fit_exception("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); } } if (!tensor_buft_overrides) { - throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort"); + throw llama_params_fit_exception("did not provide buffer to set tensor_buft_overrides, abort"); } if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { - throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort"); + throw llama_params_fit_exception("model_params::tensor_buft_overrides already set by user, abort"); } // step 3: iteratively fill the back to front with "dense" layers @@ -362,8 +367,7 @@ static void llama_params_fit_impl( auto set_ngl_tensor_split_tbo = [&]( const std::vector & ngl_per_device, const std::vector & overflow_bufts, - llama_model_params & mparams, - const bool add_nonrepeating) { + llama_model_params & mparams) { mparams.n_gpu_layers = 0; for (size_t id = 0; id < nd; id++) { mparams.n_gpu_layers += ngl_per_device[id].n_layer; @@ -371,13 +375,9 @@ static void llama_params_fit_impl( tensor_split[id] = ngl_per_device[id].n_layer; } } - assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl); - uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides + assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1); + uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides - if (add_nonrepeating) { - mparams.n_gpu_layers += 1; - tensor_split[nd - 1] += 1; - } mparams.tensor_split = tensor_split; size_t itbo = 0; @@ -389,8 +389,8 @@ static void llama_params_fit_impl( tensor_buft_overrides[itbo].buft = nullptr; itbo++; mparams.tensor_buft_overrides = tensor_buft_overrides; - throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == " - + std::to_string(ntbo) + " is insufficient for model\n"); + throw llama_params_fit_exception("llama_max_tensor_buft_overrides() == " + + std::to_string(ntbo) + " is insufficient for model"); } tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? 
ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE); tensor_buft_overrides[itbo].buft = overflow_bufts[id]; @@ -408,10 +408,9 @@ static void llama_params_fit_impl( auto get_memory_for_layers = [&]( const char * func_name, const std::vector & ngl_per_device, - const std::vector & overflow_bufts, - const bool add_nonrepeating) -> std::vector { + const std::vector & overflow_bufts) -> std::vector { llama_model_params mparams_copy = *mparams; - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); const dmds_t dmd_nl = llama_get_device_memory_data( path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); @@ -469,9 +468,6 @@ static void llama_params_fit_impl( LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); } - // whether for the optimal memory use we expect to load at least some MoE tensors: - const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0; - std::vector overflow_bufts; // which bufts the partial layers of a device overflow to: overflow_bufts.reserve(nd); for (size_t id = 0; id < nd - 1; ++id) { @@ -480,7 +476,7 @@ static void llama_params_fit_impl( overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); std::vector ngl_per_device(nd); - std::vector mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe); + std::vector mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts); if (hp_nex > 0) { for (size_t id = 0; id < nd; id++) { ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE; @@ -493,13 +489,14 @@ static void llama_params_fit_impl( // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target // - check memory use of our guess, replace either the low or high bound // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits + // - the last device has the output layer, which cannot be a partial layer if (hp_nex == 0) { LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__); } else { LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__); } for (int id = nd - 1; id >= 0; id--) { - uint32_t n_unassigned = hp_ngl; + uint32_t n_unassigned = hp_ngl + 1; for (size_t jd = id + 1; jd < nd; ++jd) { assert(n_unassigned >= ngl_per_device[jd].n_layer); n_unassigned -= ngl_per_device[jd].n_layer; @@ -508,13 +505,16 @@ static void llama_params_fit_impl( std::vector ngl_per_device_high = ngl_per_device; ngl_per_device_high[id].n_layer = n_unassigned; if (hp_nex > 0) { - ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer; + ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? 
ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1; } if (ngl_per_device_high[id].n_layer > 0) { - std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); + std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; + if (hp_nex > 0 && size_t(id) == nd - 1) { + delta--; + } LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta); while (delta > 1) { uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); @@ -526,7 +526,7 @@ static void llama_params_fit_impl( if (hp_nex) { ngl_per_device_test[id].n_part += step_size; } - const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; @@ -542,6 +542,7 @@ static void llama_params_fit_impl( } else { assert(ngl_per_device_high[id].n_layer == n_unassigned); ngl_per_device = ngl_per_device_high; + mem = mem_high; LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); } } @@ -552,7 +553,7 @@ static void llama_params_fit_impl( __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); } if (hp_nex == 0 || global_surplus_cpu_moe <= 0) { - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); return; } @@ -575,13 +576,13 @@ static void llama_params_fit_impl( for (size_t id = 0; id <= id_dense_start; id++) { std::vector ngl_per_device_high = ngl_per_device; for (size_t jd = id_dense_start; jd < nd; jd++) { - const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer; + const uint32_t n_layer_move = jd < nd - 1 ? 
ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1; ngl_per_device_high[id].n_layer += n_layer_move; ngl_per_device_high[jd].n_layer -= n_layer_move; ngl_per_device_high[jd].n_part = 0; } size_t id_dense_start_high = nd - 1; - std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe); + std::vector mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts); if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part); @@ -609,7 +610,7 @@ static void llama_params_fit_impl( break; } } - const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); + const std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; @@ -629,13 +630,14 @@ static void llama_params_fit_impl( } } else { ngl_per_device = ngl_per_device_high; + mem = mem_high; id_dense_start = id_dense_start_high; LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); } // try to fit at least part of one more layer - if (ngl_per_device[id_dense_start].n_layer > 0) { + if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) { std::vector ngl_per_device_test = ngl_per_device; size_t id_dense_start_test = id_dense_start; ngl_per_device_test[id_dense_start_test].n_layer--; @@ -647,8 +649,8 @@ static void llama_params_fit_impl( } ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); - std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); - if (mem_test[id] < targets[id]) { + std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); + if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { ngl_per_device = ngl_per_device_test; mem = mem_test; id_dense_start = id_dense_start_test; @@ -657,8 +659,8 @@ static void llama_params_fit_impl( ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); - mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); - if (mem_test[id] < targets[id]) { + mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); + if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { ngl_per_device = ngl_per_device_test; mem = mem_test; id_dense_start = id_dense_start_test; @@ -668,8 +670,8 @@ static void llama_params_fit_impl( } else { ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); - mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe); - if (mem_test[id] < targets[id]) { + mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts); + if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { ngl_per_device = ngl_per_device_test; mem = mem_test; id_dense_start = id_dense_start_test; @@ -685,25 +687,28 @@ static void llama_params_fit_impl( __func__, 
dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); } - set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe); + set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams); } -bool llama_params_fit( +enum llama_params_fit_status llama_params_fit( const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { const int64_t t0_us = llama_time_us(); - bool ok = true; + llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS; try { llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level); LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__); - } catch (const std::runtime_error & e) { + } catch (const llama_params_fit_exception & e) { LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what()); - ok = false; + status = LLAMA_PARAMS_FIT_STATUS_FAILURE; + } catch (const std::runtime_error & e) { + LLAMA_LOG_ERROR("%s: encountered an error while trying to fit params to free device memory: %s\n", __func__, e.what()); + status = LLAMA_PARAMS_FIT_STATUS_ERROR; } const int64_t t1_us = llama_time_us(); LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); - return ok; + return status; } struct llama_sampler_chain_params llama_sampler_chain_default_params() { diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index f8629300..8b3c8a7b 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h @@ -286,7 +286,7 @@ extern "C" { // NULL-terminated list of buffer types to use for tensors that match a pattern const struct llama_model_tensor_buft_override * tensor_buft_overrides; - int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers enum llama_split_mode split_mode; // how to split the model across multiple GPUs // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE @@ -467,10 +467,17 @@ extern "C" { // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); + enum llama_params_fit_status { + LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit + LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occured, e.g. 
because no model could be found at the specified path + }; + // fits mparams and cparams to free device memory (assumes system memory is unlimited) - // returns true if the parameters could be successfully modified to fit device memory - // this function is NOT thread safe because it modifies the global llama logger state - LLAMA_API bool llama_params_fit( + // - returns true if the parameters could be successfully modified to fit device memory + // - this function is NOT thread safe because it modifies the global llama logger state + // - only parameters that have the same value as in llama_default_model_params are modified + LLAMA_API enum llama_params_fit_status llama_params_fit( const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, @@ -600,6 +607,8 @@ extern "C" { // // Load a LoRA adapter from file + // The adapter is valid as long as the associated model is not freed + // All adapters must be loaded before context creation LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init( struct llama_model * model, const char * path_lora); diff --git a/examples/talk-llama/models/llama.cpp b/examples/talk-llama/models/llama.cpp index ab7fd5d0..42b5fcdf 100644 --- a/examples/talk-llama/models/llama.cpp +++ b/examples/talk-llama/models/llama.cpp @@ -1,6 +1,7 @@ #include "models.h" -llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { +template +llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + using inp_attn_type = std::conditional_t; + + inp_attn_type * inp_attn = nullptr; + if constexpr (embed) { + inp_attn = build_attn_inp_no_cache(); + } else { + inp_attn = build_attn_inp_kv(); + } const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - // lm_head - cur = build_lora_mm(model.output, cur); + if constexpr (!embed) { + // lm_head + cur = build_lora_mm(model.output, cur); - cb(cur, "result_output", -1); - res->t_logits = cur; + cb(cur, "result_output", -1); + res->t_logits = cur; + } ggml_build_forward_expand(gf, cur); } + +template struct llm_build_llama; +template struct llm_build_llama; diff --git a/examples/talk-llama/models/mimo2-iswa.cpp b/examples/talk-llama/models/mimo2-iswa.cpp new file mode 100644 index 00000000..edc87cc9 --- /dev/null +++ b/examples/talk-llama/models/mimo2-iswa.cpp @@ -0,0 +1,123 @@ + +#include "models.h" + +llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv_iswa(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + uint32_t n_head_l = hparams.n_head(il); + uint32_t n_head_kv_l = hparams.n_head_kv(il); + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + + cur = inpL; + + // self_attention + { + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + ggml_tensor * sinks = model.layers[il].attn_sinks; + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + // dense branch + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + cur = build_moe_ffn(cur, 
model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false, + 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il); + cb(cur, "ffn_moe_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/examples/talk-llama/models/models.h b/examples/talk-llama/models/models.h index ffb36acc..e2cd4e48 100644 --- a/examples/talk-llama/models/models.h +++ b/examples/talk-llama/models/models.h @@ -303,6 +303,7 @@ struct llm_build_llada_moe : public llm_graph_context { llm_build_llada_moe(const llama_model & model, const llm_graph_params & params); }; +template struct llm_build_llama : public llm_graph_context { llm_build_llama(const llama_model & model, const llm_graph_params & params); }; @@ -315,6 +316,10 @@ struct llm_build_mamba : public llm_graph_context_mamba { llm_build_mamba(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_mimo2_iswa : public llm_graph_context { + llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_minicpm3 : public llm_graph_context { llm_build_minicpm3(const llama_model & model, const llm_graph_params & params); }; @@ -327,6 +332,11 @@ struct llm_build_mistral3 : public llm_graph_context { llm_build_mistral3(const llama_model & model, const llm_graph_params & params); }; +template +struct llm_build_modern_bert : public llm_graph_context { + llm_build_modern_bert(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_mpt : public llm_graph_context { llm_build_mpt(const llama_model & model, const llm_graph_params & params); }; @@ -396,6 +406,11 @@ struct llm_build_plamo : public llm_graph_context { llm_build_plamo(const llama_model & model, const llm_graph_params & params); }; +template +struct llm_build_plamo3 : public llm_graph_context { + llm_build_plamo3(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_plm : public llm_graph_context { llm_build_plm(const llama_model & model, const llm_graph_params & params); }; diff --git a/examples/talk-llama/models/modern-bert.cpp b/examples/talk-llama/models/modern-bert.cpp new file mode 100644 index 00000000..c7809bde --- /dev/null +++ b/examples/talk-llama/models/modern-bert.cpp @@ -0,0 +1,126 @@ +#include "models.h" + +template +llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + ggml_tensor * inp_pos = build_inp_pos(); + + // construct input embeddings (token, type, position) + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "inp_embd", -1); + + // embed layer norm + inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1); + cb(inpL, "inp_norm", -1); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_attn = 
build_attn_inp_no_cache(); + + for (int il = 0; il < n_layer; ++il) { + float freq_base_l = 0.0f; + + if constexpr (iswa) { + freq_base_l = model.get_rope_freq_base(cparams, il); + } else { + freq_base_l = freq_base; + } + + cur = inpL; + + // attention layer norm + if (model.layers[il].attn_norm) { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM, il); + cb(cur, "attn_norm", il); + } + + // self attention + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + const size_t type_size = ggml_type_size(cur->type); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa)); + + // RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + cb(cur, "kqv_out", il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // re-add the layer input + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // attention layer norm + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_GEGLU, LLM_FFN_SEQ, il); + + // attentions bypass the intermediate layer + cur = ggml_add(ctx0, cur, ffn_inp); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM, -1); + cb(cur, "final_norm_out", -1); + + if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { + // extracting cls token + cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0); + cb(cur, "cls_pooled_embd", -1); + } + + cb(cur, "res_embd", -1); + res->t_embd = cur; + ggml_build_forward_expand(gf, cur); +} + +// Explicit template instantiations +template struct llm_build_modern_bert; +template struct llm_build_modern_bert; diff --git a/examples/talk-llama/models/plamo3.cpp b/examples/talk-llama/models/plamo3.cpp new file mode 100644 index 00000000..55c80646 --- /dev/null +++ b/examples/talk-llama/models/plamo3.cpp @@ -0,0 +1,128 @@ +#include "models.h" + +template +llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + const int64_t head_dim_q = hparams.n_embd_head_k; + const int64_t head_dim_v = hparams.n_embd_head_v; + + ggml_tensor * cur; + ggml_tensor * inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + + using inp_attn_type = std::conditional_t; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_iswa(); + } else { + inp_attn = 
build_attn_inp_kv(); + } + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * residual = inpL; + + float freq_base_l = 0.0f; + float freq_scale_l = 0.0f; + if constexpr (iswa) { + freq_base_l = model.get_rope_freq_base (cparams, il); + freq_scale_l = model.get_rope_freq_scale(cparams, il); + } else { + freq_base_l = freq_base; + freq_scale_l = freq_scale; + } + + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + const int32_t n_head = hparams.n_head(il); + const int32_t n_head_kv = hparams.n_head_kv(il); + + const int64_t q_offset = 0; + const int64_t k_offset = head_dim_q * n_head; + const int64_t v_offset = k_offset + head_dim_q * n_head_kv; + + ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head, n_tokens, + head_dim_q * sizeof(float), qkv->nb[1], q_offset * ggml_element_size(qkv)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim_q, n_head_kv, n_tokens, + head_dim_q * sizeof(float), qkv->nb[1], k_offset * ggml_element_size(qkv)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim_v, n_head_kv, n_tokens, + head_dim_v * sizeof(float), qkv->nb[1], v_offset * ggml_element_size(qkv)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "attn_q_norm", il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "attn_k_norm", il); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + + const float attn_scale = 1.0f / sqrtf(float(head_dim_q)); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, attn_scale, il); + cb(cur, "attn_out", il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, residual); + cb(cur, "attn_residual", il); + + residual = cur; + + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, residual); + cb(cur, "ffn_residual", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + +// Explicit template instantiations +template struct llm_build_plamo3; +template struct llm_build_plamo3;
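For reference, below is a minimal sketch of how a caller might drive the reworked llama_params_fit() API introduced above. It is not part of the synced patch: the override capacity (256), the 1 GiB margin, the 4096-token context floor, and the surrounding main() scaffolding are illustrative assumptions only; the function signature, the llama_params_fit_status values, and the other llama_* calls are taken from the headers in this sync.

    // sketch.cpp -- hypothetical caller of llama_params_fit(), not part of this sync
    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
            return 1;
        }
        const char * path_model = argv[1];

        llama_model_params   mparams = llama_model_default_params();
        llama_context_params cparams = llama_context_default_params();

        // output buffers the fitting code may fill in; the override capacity of 256 is an
        // assumption for this sketch -- if it is too small, the fit fails with a warning
        std::vector<float> tensor_split(llama_max_devices(), 0.0f);
        std::vector<llama_model_tensor_buft_override> tbo(256);

        // illustrative values: keep ~1 GiB free per device, never drop below 4096 ctx tokens
        const llama_params_fit_status status = llama_params_fit(
            path_model, &mparams, &cparams,
            tensor_split.data(), tbo.data(),
            /*margin_s  =*/ 1024ull*1024ull*1024ull,
            /*n_ctx_min =*/ 4096,
            GGML_LOG_LEVEL_INFO);

        switch (status) {
            case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
                // mparams/cparams (n_gpu_layers, tensor_split, overrides, n_ctx) now describe
                // an allocation that is projected to fit into free device memory
                break;
            case LLAMA_PARAMS_FIT_STATUS_FAILURE:
                std::fprintf(stderr, "no fitting allocation found; loading may exceed free device memory\n");
                break;
            case LLAMA_PARAMS_FIT_STATUS_ERROR:
                std::fprintf(stderr, "hard error while fitting params\n");
                return 1;
        }

        llama_model * model = llama_model_load_from_file(path_model, mparams);
        if (model == nullptr) {
            return 1;
        }
        llama_context * ctx = llama_init_from_model(model, cparams);
        // ... run inference ...
        llama_free(ctx);
        llama_model_free(model);
        return 0;
    }

The three-way status mirrors the exception split inside llama_params_fit(): a llama_params_fit_exception is reported as LLAMA_PARAMS_FIT_STATUS_FAILURE (the parameters simply could not be made to fit, e.g. values already set by the user or insufficient tensor_buft_overrides capacity), while any other std::runtime_error is reported as LLAMA_PARAMS_FIT_STATUS_ERROR (a hard error such as a model that cannot be found at the given path).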