From: Georgi Gerganov Date: Mon, 29 Sep 2025 12:18:41 +0000 (+0300) Subject: talk-llama : sync llama.cpp X-Git-Tag: upstream/1.8.0~17 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=fcf0181ee254466eae277b351e0c36350d120cc9;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp talk-llama : sync llama.cpp --- diff --git a/examples/talk-llama/llama-arch.cpp b/examples/talk-llama/llama-arch.cpp index a4d2973a..4e8d54c4 100644 --- a/examples/talk-llama/llama-arch.cpp +++ b/examples/talk-llama/llama-arch.cpp @@ -98,6 +98,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLADA, "llada" }, { LLM_ARCH_LLADA_MOE, "llada-moe" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, + { LLM_ARCH_GROVEMOE, "grovemoe" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -125,6 +126,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" }, { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" }, + { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" }, { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, @@ -133,6 +135,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_EXPERT_GROUP_SCALE, "%s.expert_group_scale" }, + { LLM_KV_EXPERTS_PER_GROUP, "%s.experts_per_group" }, { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, @@ -721,6 +725,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_CLS_OUT, "cls.output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, @@ -2185,6 +2190,29 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GROVEMOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_CHEXPS, "blk.%d.ffn_gate_chexps" }, + { LLM_TENSOR_FFN_DOWN_CHEXPS, "blk.%d.ffn_down_chexps" }, + { LLM_TENSOR_FFN_UP_CHEXPS, "blk.%d.ffn_up_chexps" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2317,6 +2345,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + 
{LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, // altup / laurel (gemma 3n) {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, diff --git a/examples/talk-llama/llama-arch.h b/examples/talk-llama/llama-arch.h index d181ce67..b5c6f3d7 100644 --- a/examples/talk-llama/llama-arch.h +++ b/examples/talk-llama/llama-arch.h @@ -102,6 +102,7 @@ enum llm_arch { LLM_ARCH_LLADA, LLM_ARCH_LLADA_MOE, LLM_ARCH_SEED_OSS, + LLM_ARCH_GROVEMOE, LLM_ARCH_UNKNOWN, }; @@ -129,6 +130,7 @@ enum llm_kv { LLM_KV_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, + LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, @@ -137,6 +139,8 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_EXPERT_GROUP_SCALE, + LLM_KV_EXPERTS_PER_GROUP, LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_NEXTN_PREDICT_LAYERS, LLM_KV_POOLING_TYPE, @@ -301,6 +305,9 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_DOWN_CHEXPS, + LLM_TENSOR_FFN_GATE_CHEXPS, + LLM_TENSOR_FFN_UP_CHEXPS, LLM_TENSOR_FFN_EXP_PROBS_B, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, diff --git a/examples/talk-llama/llama-context.cpp b/examples/talk-llama/llama-context.cpp index e6f76421..d8a8b5e6 100644 --- a/examples/talk-llama/llama-context.cpp +++ b/examples/talk-llama/llama-context.cpp @@ -2027,6 +2027,21 @@ void llama_context::perf_reset() { n_reused = 0; } +std::map llama_context::memory_breakdown() const { + std::map ret; + for (const auto & buft_size : model.memory_breakdown()) { + ret[buft_size.first].model += buft_size.second; + } + for (const auto & buft_size : memory->memory_breakdown()) { + ret[buft_size.first].context += buft_size.second; + } + for (const auto & backend_ptr : backends) { + ggml_backend_t backend = backend_ptr.get(); + ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + } + return ret; +} + // // training // @@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) { ctx->perf_reset(); } +void llama_memory_breakdown_print(const struct llama_context * ctx) { + const std::vector & devices = ctx->get_model().devices; + + std::map memory_breakdown = ctx->memory_breakdown(); + + std::vector> table_data; + table_data.reserve(devices.size()); + const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n"; + const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n"; + const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n"; + + table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"}); + + constexpr size_t MiB = 1024 * 1024; + const std::vector desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "}; + + // track seen buffer types to avoid double counting: + std::set seen_buffer_types; + + // accumulative memory breakdown for each device and for host: + std::vector mb_dev(devices.size()); + llama_memory_breakdown_data mb_host; + + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const 
llama_memory_breakdown_data & mb = buft_mb.second; + if (ggml_backend_buft_is_host(buft)) { + mb_host.model += mb.model; + mb_host.context += mb.context; + mb_host.compute += mb.compute; + seen_buffer_types.insert(buft); + continue; + } + ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); + if (dev) { + int i_dev = -1; + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i] == dev) { + i_dev = i; + break; + } + } + if (i_dev != -1) { + mb_dev[i_dev].model += mb.model; + mb_dev[i_dev].context += mb.context; + mb_dev[i_dev].compute += mb.compute; + seen_buffer_types.insert(buft); + continue; + } + } + } + + // print memory breakdown for each device: + for (size_t i = 0; i < devices.size(); i++) { + ggml_backend_dev_t dev = devices[i]; + llama_memory_breakdown_data mb = mb_dev[i]; + + const std::string name = ggml_backend_dev_name(dev); + std::string desc = ggml_backend_dev_description(dev); + for (const std::string & prefix : desc_prefixes_strip) { + if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) { + desc = desc.substr(prefix.length()); + } + } + + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + + const size_t self = mb.model + mb.context + mb.compute; + const size_t unaccounted = total - self - free; + + table_data.push_back({ + template_gpu, + " - " + name + " (" + desc + ")", + std::to_string(total / MiB), + std::to_string(free / MiB), + std::to_string(self / MiB), + std::to_string(mb.model / MiB), + std::to_string(mb.context / MiB), + std::to_string(mb.compute / MiB), + std::to_string(unaccounted / MiB)}); + } + + // print memory breakdown for host: + { + const size_t self = mb_host.model + mb_host.context + mb_host.compute; + table_data.push_back({ + template_other, + " - Host", + "", // total + "", // free + std::to_string(self / MiB), + std::to_string(mb_host.model / MiB), + std::to_string(mb_host.context / MiB), + std::to_string(mb_host.compute / MiB), + ""}); // unaccounted + } + + // print memory breakdown for all remaining buffer types: + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + if (seen_buffer_types.count(buft) == 1) { + continue; + } + const std::string name = ggml_backend_buft_name(buft); + const size_t self = mb.model + mb.context + mb.compute; + table_data.push_back({ + template_other, + " - " + name, + "", // total + "", // free + std::to_string(self / MiB), + std::to_string(mb.model / MiB), + std::to_string(mb.context / MiB), + std::to_string(mb.compute / MiB), + ""}); // unaccounted + seen_buffer_types.insert(buft); + } + + for (size_t j = 1; j < table_data[0].size(); j++) { + size_t max_len = 0; + for (const auto & td : table_data) { + max_len = std::max(max_len, td[j].length()); + } + for (auto & td : table_data) { + td[j].insert(j == 1 ? 
td[j].length() : 0, max_len - td[j].length(), ' '); + } + } + for (const auto & td : table_data) { + LLAMA_LOG_INFO(td[0].c_str(), + __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(), + td[6].c_str(), td[7].c_str(), td[8].c_str()); + } +} + // // training // diff --git a/examples/talk-llama/llama-context.h b/examples/talk-llama/llama-context.h index f23aa8ee..ed6d82cb 100644 --- a/examples/talk-llama/llama-context.h +++ b/examples/talk-llama/llama-context.h @@ -17,9 +17,17 @@ class llama_batch_allocr; class llama_io_read_i; class llama_io_write_i; +// "memory" as in abstract memory for the context struct llama_memory_i; struct llama_memory_context_i; +// "memory" as in physical memory for a buffer type, in bytes +struct llama_memory_breakdown_data { + size_t model = 0; // memory allocated for the model + size_t context = 0; // memory allocated for the context + size_t compute = 0; // memory allocated for temporary compute buffers +}; + struct llama_context { // init scheduler and compute buffers, reserve worst-case graphs llama_context( @@ -144,6 +152,8 @@ struct llama_context { llama_perf_context_data perf_get_data() const; void perf_reset(); + std::map memory_breakdown() const; + // // training // diff --git a/examples/talk-llama/llama-graph.cpp b/examples/talk-llama/llama-graph.cpp index 9f2e417f..90cd885a 100644 --- a/examples/talk-llama/llama-graph.cpp +++ b/examples/talk-llama/llama-graph.cpp @@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) { std::vector target_pos(n_seqs_unq, -1); std::vector target_row(n_seqs_unq, -1); - bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST; + const bool last = ( + cparams.pooling_type == LLAMA_POOLING_TYPE_LAST || + (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token + ); for (int i = 0; i < n_tokens; ++i) { const llama_pos pos = ubatch->pos[i]; @@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn( selection_probs = logits; } + if (arch == LLM_ARCH_GROVEMOE) { + selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens] + cb(selection_probs, "ffn_moe_probs_biased", il); + } + // select experts ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens] cb(selected_experts->src[0], "ffn_moe_argsort", il); cb(selected_experts, "ffn_moe_topk", il); - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) { + // TODO: Use scalar div instead when/if implemented + ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32); + selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32); + probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens); + } else { + probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens); + } + + ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens] cb(weights, "ffn_moe_weights", il); + if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) { weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens] @@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(weights, "ffn_moe_weights_scaled", il); } + //call early so that 
topk-moe can be used + ggml_build_forward_expand(gf, weights); + cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens); if (weight_before_ffn) { @@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const { } ggml_tensor * llm_graph_context::build_inp_cls() const { - auto inp = std::make_unique(cparams); + auto inp = std::make_unique(cparams, arch); auto & cur = inp->cls; @@ -1877,34 +1897,32 @@ void llm_graph_context::build_pooling( case LLAMA_POOLING_TYPE_RANK: { ggml_tensor * inp_cls = build_inp_cls(); - inp = ggml_get_rows(ctx0, inp, inp_cls); + cur = ggml_get_rows(ctx0, inp, inp_cls); + // classification head + // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 if (cls) { - // classification head - // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566 - cur = ggml_mul_mat(ctx0, cls, inp); + cur = ggml_mul_mat(ctx0, cls, cur); if (cls_b) { cur = ggml_add(ctx0, cur, cls_b); } cur = ggml_tanh(ctx0, cur); + } - // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en - // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 - if (cls_out) { - cur = ggml_mul_mat(ctx0, cls_out, cur); - if (cls_out_b) { - cur = ggml_add(ctx0, cur, cls_out_b); - } - } - } else if (cls_out) { - // Single layer classification head (direct projection) - // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476 - cur = ggml_mul_mat(ctx0, cls_out, inp); + // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896 + // Single layer classification head (direct projection) + // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476 + if (cls_out) { + cur = ggml_mul_mat(ctx0, cls_out, cur); if (cls_out_b) { cur = ggml_add(ctx0, cur, cls_out_b); } - } else { - GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b"); + } + + // softmax for qwen3 reranker + if (arch == LLM_ARCH_QWEN3) { + cur = ggml_soft_max(ctx0, cur); } } break; default: diff --git a/examples/talk-llama/llama-graph.h b/examples/talk-llama/llama-graph.h index ca90fdf6..34b984af 100644 --- a/examples/talk-llama/llama-graph.h +++ b/examples/talk-llama/llama-graph.h @@ -206,7 +206,7 @@ public: class llm_graph_input_cls : public llm_graph_input_i { public: - llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {} + llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {} virtual ~llm_graph_input_cls() = default; void set_input(const llama_ubatch * ubatch) override; @@ -214,6 +214,7 @@ public: ggml_tensor * cls; // I32 [n_batch] const llama_cparams cparams; + const llm_arch arch; }; class llm_graph_input_rs : public llm_graph_input_i { diff --git a/examples/talk-llama/llama-hparams.h b/examples/talk-llama/llama-hparams.h index 202cbbd1..0fe4b569 100644 --- a/examples/talk-llama/llama-hparams.h +++ b/examples/talk-llama/llama-hparams.h @@ -69,10 +69,13 @@ struct llama_hparams { uint32_t n_lora_kv = 
0; uint32_t n_ff_exp = 0; uint32_t n_ff_shexp = 0; + uint32_t n_ff_chexp = 0; uint32_t n_expert_shared = 0; uint32_t n_norm_groups = 0; + uint32_t n_group_experts = 0; - float expert_weights_scale = 0.0; + float expert_group_scale = 0.05f; + float expert_weights_scale = 0.0f; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t moe_every_n_layers = 0; diff --git a/examples/talk-llama/llama-kv-cache-iswa.cpp b/examples/talk-llama/llama-kv-cache-iswa.cpp index d7342914..827302e6 100644 --- a/examples/talk-llama/llama-kv-cache-iswa.cpp +++ b/examples/talk-llama/llama-kv-cache-iswa.cpp @@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const { return kv_swa->seq_pos_max(seq_id); } +std::map llama_kv_cache_iswa::memory_breakdown() const { + std::map mb = kv_base->memory_breakdown(); + for (const auto & buft_size : kv_swa->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + return mb; +} + llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { GGML_UNUSED(embd_all); diff --git a/examples/talk-llama/llama-kv-cache-iswa.h b/examples/talk-llama/llama-kv-cache-iswa.h index 5ed134b7..70ab22f0 100644 --- a/examples/talk-llama/llama-kv-cache-iswa.h +++ b/examples/talk-llama/llama-kv-cache-iswa.h @@ -56,6 +56,8 @@ public: llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; diff --git a/examples/talk-llama/llama-kv-cache.cpp b/examples/talk-llama/llama-kv-cache.cpp index 885be072..816f2d5d 100644 --- a/examples/talk-llama/llama-kv-cache.cpp +++ b/examples/talk-llama/llama-kv-cache.cpp @@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { return cells.seq_pos_max(seq_id); } +std::map llama_kv_cache::memory_breakdown() const { + std::map ret; + for (const ggml_backend_buffer_ptr & buf_ptr : bufs) { + ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); + } + return ret; +} + llama_memory_context_ptr llama_kv_cache::init_batch( llama_batch_allocr & balloc, uint32_t n_ubatch, diff --git a/examples/talk-llama/llama-kv-cache.h b/examples/talk-llama/llama-kv-cache.h index 30de013f..85f0663d 100644 --- a/examples/talk-llama/llama-kv-cache.h +++ b/examples/talk-llama/llama-kv-cache.h @@ -121,6 +121,8 @@ public: llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; diff --git a/examples/talk-llama/llama-memory-hybrid.cpp b/examples/talk-llama/llama-memory-hybrid.cpp index ba61ebaa..abf65248 100644 --- a/examples/talk-llama/llama-memory-hybrid.cpp +++ b/examples/talk-llama/llama-memory-hybrid.cpp @@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const { return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id)); } +std::map llama_memory_hybrid::memory_breakdown() const { + std::map mb = mem_attn->memory_breakdown(); + for (const auto & buft_size : mem_recr->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + 
return mb; +} + void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { GGML_UNUSED(flags); diff --git a/examples/talk-llama/llama-memory-hybrid.h b/examples/talk-llama/llama-memory-hybrid.h index 11a35651..558cafdf 100644 --- a/examples/talk-llama/llama-memory-hybrid.h +++ b/examples/talk-llama/llama-memory-hybrid.h @@ -68,6 +68,8 @@ public: llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override; diff --git a/examples/talk-llama/llama-memory-recurrent.cpp b/examples/talk-llama/llama-memory-recurrent.cpp index 08716ed9..44645fcd 100644 --- a/examples/talk-llama/llama-memory-recurrent.cpp +++ b/examples/talk-llama/llama-memory-recurrent.cpp @@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { return result; } +std::map llama_memory_recurrent::memory_breakdown() const { + std::map ret; + for (const ggml_backend_buffer_ptr & buf_ptr : bufs) { + ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); + } + return ret; +} + llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { do { balloc.split_reset(); diff --git a/examples/talk-llama/llama-memory-recurrent.h b/examples/talk-llama/llama-memory-recurrent.h index c4daf004..077c6e3c 100644 --- a/examples/talk-llama/llama-memory-recurrent.h +++ b/examples/talk-llama/llama-memory-recurrent.h @@ -4,6 +4,7 @@ #include "llama-graph.h" #include "llama-memory.h" +#include #include #include @@ -50,6 +51,8 @@ public: llama_pos seq_pos_min(llama_seq_id seq_id) const override; llama_pos seq_pos_max(llama_seq_id seq_id) const override; + std::map memory_breakdown() const override; + bool prepare(const std::vector & ubatches); // find a contiguous slot of memory cells and emplace the ubatch there diff --git a/examples/talk-llama/llama-memory.h b/examples/talk-llama/llama-memory.h index ccd1f073..4a157b91 100644 --- a/examples/talk-llama/llama-memory.h +++ b/examples/talk-llama/llama-memory.h @@ -2,6 +2,7 @@ #include "llama.h" +#include #include #include @@ -108,6 +109,8 @@ struct llama_memory_i { virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0; virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0; + virtual std::map memory_breakdown() const = 0; + // // state write/read // diff --git a/examples/talk-llama/llama-model.cpp b/examples/talk-llama/llama-model.cpp index 981e5708..ffd9286e 100644 --- a/examples/talk-llama/llama-model.cpp +++ b/examples/talk-llama/llama-model.cpp @@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_2B: return "2B"; + case LLM_TYPE_2_6B: return "2.6B"; case LLM_TYPE_2_8B: return "2.8B"; case LLM_TYPE_2_9B: return "2.9B"; case LLM_TYPE_3B: return "3B"; @@ -674,10 +675,17 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_MINICPM: { + // Backward-compatible defaults for older MiniCPM GGUFs + hparams.f_embedding_scale = 12.0f; + hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer)); + hparams.f_logit_scale = hparams.n_embd ? 
(256.0f / float(hparams.n_embd)) : 1.0f; + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); - ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); - ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + + // Optional KV reads, override defaults if present in newer GGUF exports + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /*required=*/false); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /*required=*/false); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /*required=*/false); // MiniCPM uses rope by default, unlike Granite which uses it as a switch hparams.rope_finetuned = true; @@ -1977,10 +1985,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { for (uint32_t il = 0; il < hparams.n_layer; ++il) { hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; } - switch (hparams.n_embd) { - case 1024: type = LLM_TYPE_350M; break; - case 1536: type = LLM_TYPE_700M; break; - case 2048: type = LLM_TYPE_1_2B; break; + switch (hparams.n_ff()) { + case 4608: type = LLM_TYPE_350M; break; + case 6912: type = LLM_TYPE_700M; break; + case 8192: type = LLM_TYPE_1_2B; break; + case 10752: type = LLM_TYPE_2_6B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -2007,6 +2016,19 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GROVEMOE: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp); + ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); + ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -3165,6 +3187,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } + // output rerank head + cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED); + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -5835,6 +5860,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); } } break; + case LLM_ARCH_GROVEMOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for GROVEMOE"); + GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for GROVEMOE"); + GGML_ASSERT(hparams.n_group_experts > 0 && "n_group_experts must be > 0 for GROVEMOE"); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, 
"weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + + // MoE branch + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used; + const int64_t n_ff_chexp = hparams.n_ff_chexp ? hparams.n_ff_chexp : n_embd_head_k; + const int64_t n_chunk_expert = n_expert / hparams.n_group_experts; + + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + + layer.ffn_gate_chexps = create_tensor(tn(LLM_TENSOR_FFN_GATE_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0); + layer.ffn_down_chexps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_CHEXPS, "weight", i), {n_ff_chexp, n_embd, n_chunk_expert}, 0); + layer.ffn_up_chexps = create_tensor(tn(LLM_TENSOR_FFN_UP_CHEXPS, "weight", i), { n_embd, n_ff_chexp, n_chunk_expert}, 0); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -6003,6 +6075,14 @@ size_t llama_model::n_devices() const { return devices.size(); } +std::map llama_model::memory_breakdown() const { + std::map ret; + for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) { + ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get()); + } + return ret; +} + uint64_t llama_model::n_elements() const { return pimpl->n_elements; } @@ -6166,6 +6246,13 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); } + if (arch == LLM_ARCH_GROVEMOE) { + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp); + LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts); + LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale); + } + vocab.print_info(); } @@ -18851,6 +18938,156 @@ struct llm_build_smallthinker : public llm_graph_context{ } }; +struct llm_build_grovemoe : public llm_graph_context { + llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_chunk_expert = n_expert / hparams.n_group_experts; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; 
il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens] + cb(probs, "ffn_moe_logits", il); + + ggml_tensor * moe_out = + build_moe_ffn(cur, + nullptr, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il, probs); + cb(moe_out, "ffn_moe_out", il); + cur = moe_out; + + // TODO: Only do the expert selection and weights once + moe_out = + build_moe_ffn(cur, + nullptr, + model.layers[il].ffn_up_chexps, + model.layers[il].ffn_gate_chexps, + model.layers[il].ffn_down_chexps, + nullptr, + n_chunk_expert, n_expert_used > n_chunk_expert ? 
n_chunk_expert : n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il, probs); + cb(moe_out, "ffn_adj_moe_out", il); + + cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale)); + cb(cur, "ffn_final_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -19377,6 +19614,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique>(*this, params); } } break; + case LLM_ARCH_GROVEMOE: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -19582,6 +19823,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_SMALLTHINKER: case LLM_ARCH_GLM4_MOE: case LLM_ARCH_SEED_OSS: + case LLM_ARCH_GROVEMOE: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/examples/talk-llama/llama-model.h b/examples/talk-llama/llama-model.h index b1981978..d73ce969 100644 --- a/examples/talk-llama/llama-model.h +++ b/examples/talk-llama/llama-model.h @@ -7,6 +7,7 @@ #include "llama-memory.h" #include "llama-vocab.h" +#include #include #include #include @@ -58,6 +59,7 @@ enum llm_type { LLM_TYPE_1_7B, LLM_TYPE_1_8B, LLM_TYPE_2B, + LLM_TYPE_2_6B, LLM_TYPE_2_8B, LLM_TYPE_2_9B, LLM_TYPE_3B, @@ -273,6 +275,11 @@ struct llama_layer { struct ggml_tensor * ffn_down_shexp = nullptr; struct ggml_tensor * ffn_up_shexp = nullptr; + // ff adjugate experts (chexps) + struct ggml_tensor * ffn_gate_chexps = nullptr; + struct ggml_tensor * ffn_down_chexps = nullptr; + struct ggml_tensor * ffn_up_chexps = nullptr; + // ff bias struct ggml_tensor * ffn_gate_b = nullptr; struct ggml_tensor * ffn_down_b = nullptr; // b2 @@ -452,10 +459,12 @@ struct llama_model { std::string desc() const; - size_t size() const; + size_t size() const; // file size size_t n_tensors() const; size_t n_devices() const; + std::map memory_breakdown() const; + // total number of parameters in the model uint64_t n_elements() const; diff --git a/examples/talk-llama/llama-vocab.cpp b/examples/talk-llama/llama-vocab.cpp index 8cb36661..da938af0 100644 --- a/examples/talk-llama/llama-vocab.cpp +++ b/examples/talk-llama/llama-vocab.cpp @@ -1772,7 +1772,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx); const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx); precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap); -#ifdef IS_BIG_ENDIAN +#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ // correct endiannes of data in precompiled_charsmap binary blob uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0]; *xcda_blob_size = __builtin_bswap32(*xcda_blob_size); diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h index 453190e8..452d9ec5 100644 --- a/examples/talk-llama/llama.h +++ b/examples/talk-llama/llama.h 
@@ -1329,24 +1329,25 @@ extern "C" { // // Performance utils // - // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. + // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements. // struct llama_perf_context_data { - double t_start_ms; - double t_load_ms; - double t_p_eval_ms; - double t_eval_ms; - - int32_t n_p_eval; - int32_t n_eval; - int32_t n_reused; // number of times a ggml compute graph had been reused + // ms == milliseconds + double t_start_ms; // absolute start time + double t_load_ms; // time needed for loading the model + double t_p_eval_ms; // time needed for processing the prompt + double t_eval_ms; // time needed for generating tokens + + int32_t n_p_eval; // number of prompt tokens + int32_t n_eval; // number of generated tokens + int32_t n_reused; // number of times a ggml compute graph had been reused }; struct llama_perf_sampler_data { - double t_sample_ms; + double t_sample_ms; // time needed for sampling in ms - int32_t n_sample; + int32_t n_sample; // number of sampled tokens }; LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); @@ -1358,6 +1359,9 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); + // print a breakdown of per-device memory use via LLAMA_LOG: + LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); + // // training // diff --git a/examples/talk-llama/unicode.h b/examples/talk-llama/unicode.h index 0a5fa2a7..5bd1362f 100644 --- a/examples/talk-llama/unicode.h +++ b/examples/talk-llama/unicode.h @@ -4,6 +4,7 @@ #include #include +// TODO: reimplement this structure in endian-independent way struct unicode_cpt_flags { enum { UNDEFINED = 0x0001, @@ -15,6 +16,10 @@ struct unicode_cpt_flags { SYMBOL = 0x0040, // regex: \p{S} CONTROL = 0x0080, // regex: \p{C} MASK_CATEGORIES = 0x00FF, + WHITESPACE = 0x0100, + LOWERCASE = 0x0200, + UPPERCASE = 0x0400, + NFD = 0x0800, }; // codepoint type @@ -34,11 +39,49 @@ struct unicode_cpt_flags { // decode from uint16 inline unicode_cpt_flags(const uint16_t flags = 0) { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ *reinterpret_cast(this) = flags; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + is_undefined = (flags & UNDEFINED) ? 1 : 0; + is_number = (flags & NUMBER) ? 1 : 0; + is_letter = (flags & LETTER) ? 1 : 0; + is_separator = (flags & SEPARATOR) ? 1 : 0; + is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0; + is_punctuation = (flags & PUNCTUATION) ? 1 : 0; + is_symbol = (flags & SYMBOL) ? 1 : 0; + is_control = (flags & CONTROL) ? 1 : 0; + is_whitespace = (flags & WHITESPACE) ? 1 : 0; + is_lowercase = (flags & LOWERCASE) ? 1 : 0; + is_uppercase = (flags & UPPERCASE) ? 1 : 0; + is_nfd = (flags & NFD) ? 
1 : 0;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }
 
     inline uint16_t as_uint() const {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         return *reinterpret_cast<const uint16_t *>(this);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+        uint16_t result =
+            is_undefined   * UNDEFINED   +
+            is_number      * NUMBER      +
+            is_letter      * LETTER      +
+            is_separator   * SEPARATOR   +
+            is_accent_mark * ACCENT_MARK +
+            is_punctuation * PUNCTUATION +
+            is_symbol      * SYMBOL      +
+            is_control     * CONTROL     +
+            is_whitespace  * WHITESPACE  +
+            is_lowercase   * LOWERCASE   +
+            is_uppercase   * UPPERCASE   +
+            is_nfd         * NFD
+            ;
+
+        return result;
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
     }
 
     inline uint16_t category_flag() const {
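
Note (not part of the patch): a minimal usage sketch of the llama_memory_breakdown_print() helper that this sync adds to llama.h. The model path is a placeholder, and the loader/context helpers are assumed to be the current llama.h entry points; they may differ in older revisions.

    // build as C++ and link against llama; "model.gguf" is a hypothetical path
    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == nullptr) {
            llama_backend_free();
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        llama_context * ctx = llama_init_from_model(model, cparams);

        // prints a per-device table of model / context / compute buffer sizes via LLAMA_LOG,
        // as implemented by llama_memory_breakdown_print() in llama-context.cpp above
        llama_memory_breakdown_print(ctx);

        llama_free(ctx);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }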