}
*/
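+// scale the logits by 1/temp; for temp <= 0 this degenerates to greedy selection:
+// keep only the single most likely token and mask every other logit to -inf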
+static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
+ if (temp <= 0.0f) {
+ // find the token with the highest logit and set the rest to -inf
+ size_t max_i = 0;
+ float max_l = cur_p->data[0].logit;
+
+ for (size_t i = 1; i < cur_p->size; ++i) {
+ if (cur_p->data[i ].logit > max_l) {
+ cur_p->data[max_i].logit = -INFINITY;
+ max_i = i;
+ max_l = cur_p->data[i].logit;
+ } else {
+ cur_p->data[i].logit = -INFINITY;
+ }
+ }
+
+ return;
+ }
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].logit /= temp;
+ }
+}
+
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
GGML_ASSERT(cur_p->size > 0);
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dist *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p);
+
cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
}
static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_temp *) smpl->ctx;
- for (size_t i = 0; i < cur_p->size; ++i) {
- cur_p->data[i].logit /= ctx->temp;
- }
+
+ llama_sampler_temp_impl(cur_p, ctx->temp);
}
static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
if (ctx->delta > 0) {
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
const float max_temp = ctx->temp + ctx->delta;
+
float exponent_val = ctx->exponent;
// no need to do anything if there is only one (or zero) candidates
#endif
// Apply the dynamically calculated temperature scaling
- for (size_t i = 0; i < cur_p->size; ++i) {
- cur_p->data[i].logit /= dyn_temp;
- }
+ llama_sampler_temp_impl(cur_p, dyn_temp);
// Re-compute softmax probabilities after scaling logits with dynamic temperature
const double max_l_double = cur_p->data[0].logit;
}
#endif
} else {
- for (size_t i = 0; i < cur_p->size; ++i) {
- cur_p->data[i].logit /= ctx->temp;
- }
+ llama_sampler_temp_impl(cur_p, ctx->temp);
}
}
};
}
+// xtc
+
+struct llama_sampler_xtc {
+ const float probability;
+ const float threshold;
+ const size_t min_keep;
+
+ const uint32_t seed;
+ uint32_t seed_cur;
+
+ std::mt19937 rng;
+};
+
+static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
+ return "xtc";
+}
+
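+// XTC ("exclude top choices"): with probability `probability`, drop every candidate except the last one whose
+// probability is still >= `threshold` (candidates are sorted by the softmax below), subject to `min_keep`.
+// Example: p = {0.40, 0.30, 0.20, 0.10} with threshold = 0.15 -> indices 0..2 pass the threshold,
+// so indices 0 and 1 are removed and sampling continues from {0.20, 0.10}.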
+static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+
+ if (ctx->probability <= 0.0f
+ || ctx->threshold > 0.5f
+ || cur_p->size < 2) {
+ return;
+ }
+
+ std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
+ float chance = distribution(ctx->rng);
+ if (chance > ctx->probability) return;
+
+ // in case it's not sorted/recalculated yet
+ llama_sampler_softmax_impl(cur_p);
+
+ int pos_last = 0;
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (cur_p->data[i].p >= ctx->threshold) {
+ pos_last = i;
+ } else break;
+ }
+
+ if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
+ cur_p->data += pos_last;
+ cur_p->size -= pos_last;
+ }
+}
+
+static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
+ auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
+
+ // copy the state
+ {
+ auto * result_ctx = (llama_sampler_xtc *) result->ctx;
+
+ result_ctx->rng = ctx->rng;
+ }
+
+ return result;
+}
+
+static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_xtc *) smpl->ctx;
+}
+
+static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_xtc *) smpl->ctx;
+ ctx->seed_cur = get_rng_seed(ctx->seed);
+ ctx->rng.seed(ctx->seed_cur);
+}
+
+static struct llama_sampler_i llama_sampler_xtc_i = {
+ /* .name = */ llama_sampler_xtc_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sample_xtc_apply,
+ /* .reset = */ llama_sampler_xtc_reset,
+ /* .clone = */ llama_sampler_xtc_clone,
+ /* .free = */ llama_sampler_xtc_free,
+};
+
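+// usage sketch (assuming the llama_sampler_chain API from llama.h):
+//   llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+//   llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.5f, 0.1f, 1, LLAMA_DEFAULT_SEED));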
+struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
+ auto seed_cur = get_rng_seed(seed);
+ return new llama_sampler {
+ /* .iface = */ &llama_sampler_xtc_i,
+ /* .ctx = */ new llama_sampler_xtc {
+ /* .probability = */ p,
+ /* .threshold = */ t,
+ /* .min_keep = */ min_keep,
+ /* .seed = */ seed,
+ /* .seed_cur = */ seed_cur,
+ /* .rng = */ std::mt19937(seed_cur),
+ },
+ };
+}
+
// mirostat
struct llama_sampler_mirostat {
};
}
+// DRY
+
+struct llama_sampler_dry {
+ int32_t total_context_size;
+
+ const float dry_multiplier;
+ const float dry_base;
+ const int32_t dry_allowed_length;
+ const int32_t dry_penalty_last_n;
+
+ std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
+ std::vector<int> dry_repeat_count;
+ std::unordered_map<llama_token, int> dry_max_token_repeat;
+ ring_buffer<llama_token> last_tokens;
+};
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
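+// For a sequence-breaker string, find every vocab token whose text either contains the whole string
+// (mapped to an empty tail) or ends with a prefix of it (mapped to the tokenization of the remaining
+// tail, clamped to max_tail_len tokens). The result feeds the "head -> tails" restart-sequence map.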
+static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
+ for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
+ std::string word = llama_detokenize(vocab, {token_id}, true);
+ if (word.find(str) != std::string::npos) {
+ token_sequences.emplace(token_id, std::vector<llama_token>());
+ } else {
+ size_t word_len = word.size(), str_len = str.size();
+ size_t pos = -1;
+ while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
+ bool match = true;
+ size_t i;
+ for (i = 1; i < str_len && i + pos < word_len; ++i) {
+ if (word[pos + i] != str[i]) {
+ match = false;
+ break;
+ }
+ }
+ if (match) {
+ std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
+ if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
+ tokenization.resize(max_tail_len);
+ }
+
+ // Ensure we don't already have a duplicate matching tokenization
+ auto its = token_sequences.equal_range(token_id);
+ bool found = false;
+ for (auto it = its.first; it != its.second; ++it) {
+ if (tokenization == it->second) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ token_sequences.emplace(token_id, tokenization);
+ }
+ }
+ }
+ }
+ }
+}
+
+static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
+ return "dry";
+}
+
+static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+ return;
+ }
+
+ ctx->last_tokens.push_back(token);
+}
+
+// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
+static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
+ return;
+ }
+
+ int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
+ int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
+
+ if (last_n_repeat <= ctx->dry_allowed_length) {
+ return;
+ }
+
+ ctx->dry_repeat_count.assign(last_n_repeat, 0);
+ ctx->dry_max_token_repeat.clear();
+
+ // Step 1: Look for restart sequences to limit the maximum repetition length.
+ // Work backwards through the context looking for any token that begins a restart sequence.
+ //
+ // The collection `restart_sequences` is a mapping from a "head" token to all "tail"
+ // sequences that together comprise a restart sequence. This allows us to quickly check
+ // whether each token is the head of a complete sequence. Most restart sequences are actually
+ // a single token, and for these the "tail" is an empty vector.
+ //
+ // If the token is a "head", test all restart sequences that begin with this token
+ // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
+ // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
+ // longest matching sequence (if any) is used to limit the maximum repetition length.
+ //
+ // Note that in the case of a short sequence contained in a longer one, this might fail to
+ // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
+ // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
+ // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
+ //
+ // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
+ // have already clamped the maximum tail sequence length when generating `restart_sequences`.
+ // With clamping, this scan is O(N) in the context length.
+
+ int rep_limit = last_n_repeat;
+ for (int i = 0; i < last_n_repeat; ++i) {
+ llama_token token = ctx->last_tokens.rat(i);
+ auto its = ctx->dry_processed_breakers.equal_range(token);
+ if (its.first == ctx->dry_processed_breakers.end()) {
+ continue;
+ }
+ int longest_match = -1;
+ for (auto it = its.first; it != its.second; ++it) {
+ // Note that (*it) does not contain the head character, so seq_len will be
+ // the restart sequence length minus 1.
+ // In the common case of a single-token restart sequence, (*it) will be empty
+ // and we will trivially match.
+ int seq_len = (int)it->second.size();
+ if (seq_len > longest_match && seq_len <= (int)i) {
+ bool match = true;
+ for (int offset = 0; offset < seq_len; ++offset) {
+ // The -1 when indexing `last_tokens` is because we already matched the head.
+ if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
+ match = false;
+ break;
+ }
+ }
+ if (match) {
+ longest_match = seq_len;
+ }
+ }
+ }
+ if (longest_match >= 0) {
+ // We found a restart sequence starting `i` tokens from the end and continuing for
+ // `longest_match` tokens.
+ rep_limit = i - longest_match;
+ break;
+ }
+ }
+ if (rep_limit < ctx->dry_allowed_length) {
+ return;
+ }
+
+ // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
+ // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
+ // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
+ //
+ // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
+ // https://ivanyu.me/blog/2014/10/15/z-algorithm/
+ //
+ // The code below is adapted from the public domain implementation by the same author here:
+ // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
+ //
+ // Example:
+ // Last N tokens: a b c c b c y a b c
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+ // ^
+ // This `3` means that the last three tokens of the context (a b c) also appear here.
+ //
+ // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
+ // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
+ // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
+ // ensure that the inner while loops only examine each token in the context once as the outer
+ // for loop iterates over the context.
+
+ {
+ const int last = last_n_repeat - 1;
+ int rt = 0, lt = 0;
+
+ for (int k = 1; k < last_n_repeat; ++k) {
+ if (k > rt) {
+ // If k is outside the current Z-box, do naive computation.
+ int n = 0;
+ while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
+ ++n;
+ }
+ ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
+ if (n > 0) {
+ lt = k;
+ rt = k+n-1;
+ }
+ } else {
+ // If k is inside the current Z-box, consider two cases.
+
+ int p = k - lt; // Pair index.
+ int right_part_len = rt - k + 1;
+
+ if (ctx->dry_repeat_count[last - p] < right_part_len) {
+ int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
+ ctx->dry_repeat_count[last - k] = n;
+ } else {
+ int i = rt + 1;
+ while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
+ i += 1;
+ }
+
+ int n = std::min(i - k, rep_limit);
+ ctx->dry_repeat_count[last - k] = n;
+ lt = k;
+ rt = i - 1;
+ }
+ }
+ }
+ }
+
+ // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
+ // that would be generated by emitting each new token that would extend a sequence.
+ //
+ // Following the same example as above:
+ // Last N tokens: a b c c b c y a b c
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
+ //
+ // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
+ // c: 3 -> 4 (from `a b c` to `a b c c`)
+ // b: 1 -> 2 (from `c` to `c b`)
+ // y: 2 -> 3 (from `b c` to `b c y`)
+
+ for (int i = 0; i < last_n_repeat - 1; ++i) {
+ int repeat_len = ctx->dry_repeat_count[i];
+ if (repeat_len >= ctx->dry_allowed_length) {
+ // This token ends a repeat, so the next token would continue one.
+ // By convention, the value of `repeat_len` only includes the tokens currently
+ // in the context, not the new token that would be added.
+ llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
+ // Track the maximum sequence ending in this token.
+ const auto& it = ctx->dry_max_token_repeat.find(token);
+ if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
+ ctx->dry_max_token_repeat[token] = repeat_len;
+ }
+ }
+ }
+
+ // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
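+ // penalty(token) = dry_multiplier * dry_base^(max_repeat_len(token) - dry_allowed_length), subtracted from the logit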
+
+ // Prevent floating point overflow in `pow(dry_base, exponent)` by clamping to `max_exponent`.
+ // Compute it from `dry_base` and the approximate log of `std::numeric_limits<float>::max()`
+ const float FLOAT_MAX_LOG = 88.7228391f;
+ int max_exponent = 0;
+ if (ctx->dry_base > 1.000001f) {
+ max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
+ }
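+ // e.g. with the common default dry_base = 1.75, max_exponent = 88.72 / ln(1.75) ~= 158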
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
+ if (af_kvp != ctx->dry_max_token_repeat.end()) {
+ // Check all sequence breakers starting with this token
+ auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
+ bool is_single_token_breaker = false;
+
+ for (auto it = range.first; it != range.second; ++it) {
+ if (it->second.empty()) {
+ is_single_token_breaker = true;
+ break;
+ }
+ }
+
+ // Apply penalty only if it's not a single-token sequence breaker
+ if (!is_single_token_breaker) {
+ int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
+ if (max_exponent > 0 && repeat_exp > max_exponent) {
+ repeat_exp = max_exponent;
+ }
+ float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
+ cur_p->data[i].logit -= penalty;
+ }
+ }
+ }
+
+ cur_p->sorted = false;
+}
+
+static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
+ ctx->last_tokens.clear();
+ ctx->dry_repeat_count.clear();
+ ctx->dry_max_token_repeat.clear();
+}
+
+static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (llama_sampler_dry *) smpl->ctx;
+
+ // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
+ auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+ // Copy the state, including the processed breakers
+ {
+ auto * result_ctx = (llama_sampler_dry *) result->ctx;
+ result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
+ result_ctx->dry_repeat_count = ctx->dry_repeat_count;
+ result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
+ result_ctx->last_tokens = ctx->last_tokens;
+ }
+
+ return result;
+}
+
+static void llama_sampler_dry_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_dry *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_dry_i = {
+ /* .name = */ llama_sampler_dry_name,
+ /* .accept = */ llama_sampler_dry_accept,
+ /* .apply = */ llama_sampler_dry_apply,
+ /* .reset = */ llama_sampler_dry_reset,
+ /* .clone = */ llama_sampler_dry_clone,
+ /* .free = */ llama_sampler_dry_free,
+};
+
+struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+ int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
+ std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
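+ // clamp breaker strings (chars) and their tail tokenizations (tokens) so that the restart-sequence
+ // scan in llama_sampler_dry_apply stays linear in the context length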
+ const int MAX_CHAR_LEN = 40;
+ const int MAX_SEQ_LEN = 20;
+
+ const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
+
+ if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
+ // Process sequence breakers
+ for (size_t i = 0; i < num_breakers; ++i) {
+ if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
+ LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
+ continue;
+ }
+
+ std::string sequence_break(seq_breakers[i]);
+ if (sequence_break.empty()) {
+ LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n");
+ continue;
+ }
+
+ if (sequence_break.size() > MAX_CHAR_LEN) {
+ LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
+ sequence_break.resize(MAX_CHAR_LEN);
+ }
+
+ get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
+ }
+ }
+
+ return new llama_sampler {
+ /* .iface = */ &llama_sampler_dry_i,
+ /* .ctx = */ new llama_sampler_dry {
+ /* .total_context_size = */ context_size,
+ /* .dry_multiplier = */ dry_multiplier,
+ /* .dry_base = */ dry_base,
+ /* .dry_allowed_length = */ dry_allowed_length,
+ /* .dry_penalty_last_n = */ dry_penalty_last_n,
+ /* .dry_processed_breakers = */ std::move(processed_breakers),
+ /* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
+ /* .dry_max_token_repeat = */ {},
+ /* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
+ },
+ };
+}
+
+// wrapper for test-sampling.cpp
+struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
+ llama_vocab dummy_vocab;
+ auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
+ auto * ctx = (llama_sampler_dry *) result->ctx;
+
+ // Process the token-based sequence breakers
+ ctx->dry_processed_breakers.clear();
+ if (seq_breakers.empty()) {
+ LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
+ } else {
+ for (const auto& breaker : seq_breakers) {
+ if (breaker.empty()) {
+ LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
+ continue;
+ }
+ llama_token head_token = breaker[0];
+ std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
+ ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
+ }
+
+ if (ctx->dry_processed_breakers.empty()) {
+ LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
+ }
+ }
+
+ return result;
+}
+
// logit-bias
struct llama_sampler_logit_bias {
};
}
+// infill
+
+//#define GGML_DEBUG_SAMPLER_INFILL
+
+struct llama_sampler_infill {
+ const struct llama_vocab * vocab;
+
+ std::vector<char> buf0;
+ std::vector<char> buf1;
+};
+
+static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
+ return "infill";
+}
+
+static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+ auto * ctx = (llama_sampler_infill *) smpl->ctx;
+
+ llama_sampler_softmax_impl(cur_p);
+
+#if defined(GGML_DEBUG_SAMPLER_INFILL)
+#define LOG_DBG_CUR LLAMA_LOG_DEBUG
+#else
+#define LOG_DBG_CUR(...)
+#endif
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+ }
+
+ float p_txt_sum = 0.0f;
+ float p_eog_sum = 0.0f;
+
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+ p_eog_sum += cur_p->data[i].p;
+ } else {
+ p_txt_sum += cur_p->data[i].p;
+ }
+ }
+
+ const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
+
+ LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
+
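+ // heuristic: keep only the EOG tokens when their combined probability exceeds one third of the
+ // text probability averaged over the candidates (3*p_eog_sum*n > p_txt_sum)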
+ if (3*p_eog_sum*cur_p->size > p_txt_sum) {
+ LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
+
+ // keep just the EOG tokens
+ const auto size_org = cur_p->size;
+
+ cur_p->size = 0;
+
+ float p_sum = 0.0f;
+
+ for (size_t i = 0; i < size_org; ++i) {
+ if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
+ p_sum += cur_p->data[i].p;
+
+ cur_p->data[cur_p->size++] = cur_p->data[i];
+ }
+ }
+
+ // normalize probs
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= p_sum;
+ }
+
+ return;
+ }
+
+ size_t n_combined = 0; GGML_UNUSED(n_combined);
+
+ // combine tokens with common prefix
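+ // e.g. if the piece of token i0 is a prefix of the piece of token i1, the probability of the less likely
+ // of the two is folded into the more likely one and the former is masked out with -inf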
+ for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
+ for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
+ if (cur_p->data[i0].logit == -INFINITY) {
+ break;
+ }
+
+ if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
+ continue;
+ }
+
+ int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+ if (len0 < 0) {
+ ctx->buf0.resize(-len0); // a negative return value is the required buffer size
+ len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
+ assert(len0 > 0);
+ }
+
+ int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+ if (len1 < 0) {
+ ctx->buf1.resize(-len1);
+ len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
+ assert(len1 > 0);
+ }
+
+ // token i0 is a prefix of token i1
+ if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
+ int dst = i0;
+ int src = i1;
+
+ // merge into the token with higher probability
+ if (cur_p->data[i1].p > cur_p->data[i0].p) {
+ std::swap(dst, src);
+ }
+
+ cur_p->data[dst].p += cur_p->data[src].p;
+ cur_p->data[src].logit = -INFINITY;
+ cur_p->data[src].p = 0.0f;
+
+ n_combined++;
+ }
+ }
+ }
+
+ size_t n_non_eog = 0;
+
+ size_t size_org = cur_p->size;
+
+ float p_sum = 0.0f;
+ float thold = 0.2f;
+
+ cur_p->size = 0;
+
+ LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
+
+ for (size_t i = 0; i < size_org; ++i) {
+ const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+ if (cur_p->data[i].p < thold && !is_eog) {
+ continue;
+ }
+
+ if (!is_eog) {
+ ++n_non_eog;
+ }
+
+ p_sum += cur_p->data[i].p;
+
+ // keep this token
+ cur_p->data[cur_p->size++] = cur_p->data[i];
+ }
+
+ LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
+
+ // if no non-EOG tokens are left -> reduce cur_p to single EOT token
+ if (n_non_eog == 0) {
+ cur_p->size = 1;
+ cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
+ cur_p->data[0].logit = 1.0f;
+
+ return;
+ }
+
+ // normalize probs
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= p_sum;
+
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+ }
+
+ size_org = cur_p->size;
+ p_sum = 0.0f;
+ thold = 1.0/(n_non_eog + 1);
+
+ cur_p->size = 0;
+
+ LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
+
+ for (size_t i = 0; i < size_org; ++i) {
+ const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
+
+ if (cur_p->data[i].p < thold && !is_eog) {
+ continue;
+ }
+
+ p_sum += cur_p->data[i].p;
+
+ cur_p->data[cur_p->size++] = cur_p->data[i];
+ }
+
+ // normalize probs
+ for (size_t i = 0; i < cur_p->size; ++i) {
+ cur_p->data[i].p /= p_sum;
+
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
+ }
+
+#undef LOG_DBG_CUR
+}
+
+static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
+ const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
+ return llama_sampler_init_infill_impl(*ctx->vocab);
+}
+
+static void llama_sampler_infill_free(struct llama_sampler * smpl) {
+ delete (llama_sampler_infill *) smpl->ctx;
+}
+
+static struct llama_sampler_i llama_sampler_infill_i = {
+ /* .name = */ llama_sampler_infill_name,
+ /* .accept = */ nullptr,
+ /* .apply = */ llama_sampler_infill_apply,
+ /* .reset = */ nullptr,
+ /* .clone = */ llama_sampler_infill_clone,
+ /* .free = */ llama_sampler_infill_free,
+};
+
+struct llama_sampler * llama_sampler_init_infill_impl(
+ const struct llama_vocab & vocab) {
+ return new llama_sampler {
+ /* .iface = */ &llama_sampler_infill_i,
+ /* .ctx = */ new llama_sampler_infill {
+ /* .vocab = */ &vocab,
+ /* .buf0 = */ std::vector<char>(512),
+ /* .buf1 = */ std::vector<char>(512),
+ },
+ };
+}
+
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_RPC
-# include "ggml-rpc.h"
-#endif
-
-#if defined(GGML_USE_VULKAN)
-# include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-# include "ggml-sycl.h"
-#elif defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h"
-#elif defined(GGML_USE_CANN)
-# include "ggml-cann.h"
#endif
-#ifdef GGML_USE_BLAS
-# include "ggml-blas.h"
+#ifndef __AMX_INT8__
+#undef GGML_USE_AMX
#endif
-#ifdef GGML_USE_METAL
-# include "ggml-metal.h"
+#ifdef GGML_USE_AMX
+# include "ggml-amx.h"
#endif
// TODO: replace with ggml API call
LLM_KV_TOKENIZER_MERGES,
LLM_KV_TOKENIZER_BOS_ID,
LLM_KV_TOKENIZER_EOS_ID,
+ LLM_KV_TOKENIZER_EOT_ID,
+ LLM_KV_TOKENIZER_EOM_ID,
LLM_KV_TOKENIZER_UNK_ID,
LLM_KV_TOKENIZER_SEP_ID,
LLM_KV_TOKENIZER_PAD_ID,
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
LLM_KV_TOKENIZER_HF_JSON,
LLM_KV_TOKENIZER_RWKV,
- LLM_KV_TOKENIZER_PREFIX_ID,
- LLM_KV_TOKENIZER_SUFFIX_ID,
- LLM_KV_TOKENIZER_MIDDLE_ID,
- LLM_KV_TOKENIZER_EOT_ID,
- LLM_KV_TOKENIZER_EOM_ID,
+ LLM_KV_TOKENIZER_FIM_PRE_ID,
+ LLM_KV_TOKENIZER_FIM_SUF_ID,
+ LLM_KV_TOKENIZER_FIM_MID_ID,
+ LLM_KV_TOKENIZER_FIM_PAD_ID,
+ LLM_KV_TOKENIZER_FIM_REP_ID,
+ LLM_KV_TOKENIZER_FIM_SEP_ID,
LLM_KV_ADAPTER_TYPE,
LLM_KV_ADAPTER_LORA_ALPHA,
+
+ // deprecated:
+ LLM_KV_TOKENIZER_PREFIX_ID,
+ LLM_KV_TOKENIZER_SUFFIX_ID,
+ LLM_KV_TOKENIZER_MIDDLE_ID,
};
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
- { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
- { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
- { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
- { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
- { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
- { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
- { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
- { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
- { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
-
- { LLM_KV_SPLIT_NO, "split.no" },
- { LLM_KV_SPLIT_COUNT, "split.count" },
- { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
-
- { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
- { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
- { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
- { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
- { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
-
- { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
-
- { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
- { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
- { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
- { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
- { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
- { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
- { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
- { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
- { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
- { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
- { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
- { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
- { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
- { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
- { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
- { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
- { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
- { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
- { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
- { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
- { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
- { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
- { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
- { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
- { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
- { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
-
- { LLM_KV_ADAPTER_TYPE, "adapter.type" },
- { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+ { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+ { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+ { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+ { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+ { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+ { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+ { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+ { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+ { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+
+ { LLM_KV_SPLIT_NO, "split.no" },
+ { LLM_KV_SPLIT_COUNT, "split.count" },
+ { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
+
+ { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+ { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+ { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+ { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
+ { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
+
+ { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
+
+ { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+ { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
+ { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
+ { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
+ { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
+ { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
+ { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
+ { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
+ { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
+ { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
+ { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
+ { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
+ { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+ { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
+ { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
+ { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+ { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+ { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
+ { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
+ { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
+ { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
+ { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
+
+ { LLM_KV_ADAPTER_TYPE, "adapter.type" },
+ { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
};
struct LLM_KV {
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
- llama_token dec_start_token_id = -1;
+ llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
llama_seq_id * seq_id;
size_t offset;
size_t length;
-
- // helper for smoother batch API transition -- can be deprecated in the future
- llama_seq_id all_seq_id; // used if seq_id == NULL
};
// sequence-length-aware batch splitting
} else {
ubatch.embd = nullptr;
}
- // from here on, the else branches are deprecated;
- // they are helpers for smoother batch API transition
- if (batch->pos) {
- if (ubatch.equal_seqs) {
- for (size_t i = 0; i < length; ++i) {
- ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
- }
- } else {
- // simple split
- ubatch.pos = batch->pos + seq.offset;
- }
- } else {
+ if (ubatch.equal_seqs) {
for (size_t i = 0; i < length; ++i) {
- llama_pos bi = ids[seq.offset + i];
- ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
+ ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
}
+ } else {
+ // simple split
+ ubatch.pos = batch->pos + seq.offset;
}
if (ubatch.equal_seqs) {
ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
if (seq.seq_id) {
ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
- } else {
- GGML_ASSERT(seq.n_seq_id == 1);
- ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
}
} else {
// simple split
}
if (batch->seq_id) {
ubatch.seq_id = batch->seq_id + seq.offset;
- } else {
- for (size_t i = 0; i < length; ++i) {
- ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
- }
}
}
if (logits_all) {
s.seq_id = nullptr;
s.offset = 0;
s.length = n_tokens;
- s.all_seq_id = batch.all_seq_id;
return;
}
std::sort(ids.begin(), ids.end(),
if (batch.pos) {
return batch.pos[a] < batch.pos[b];
}
- // no pos, sort by id (assuming batch.all_pos_1 is positive)
+ // no pos, sort by id
return a < b;
}
// shared prompts go first
// init seq
llama_sbatch_seq * last_seq = nullptr;
- if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
- for (size_t i = 0; i < n_tokens; ++i) {
- const size_t bi = ids[i];
- const int32_t n_seqs = batch.n_seq_id[bi];
- llama_seq_id * seq_ids = batch.seq_id[bi];
- if (last_seq != nullptr) {
- bool same = n_seqs == last_seq->n_seq_id;
- for (int32_t j = 0; same && j < n_seqs; ++j) {
- if (seq_ids[j] != last_seq->seq_id[j]) {
- same = false;
- }
- }
- if (same) {
- last_seq->length += 1;
- continue;
+ for (size_t i = 0; i < n_tokens; ++i) {
+ const size_t bi = ids[i];
+ const int32_t n_seqs = batch.n_seq_id[bi];
+ llama_seq_id * seq_ids = batch.seq_id[bi];
+ if (last_seq != nullptr) {
+ bool same = n_seqs == last_seq->n_seq_id;
+ for (int32_t j = 0; same && j < n_seqs; ++j) {
+ if (seq_ids[j] != last_seq->seq_id[j]) {
+ same = false;
}
}
- llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id};
- seq.push_back(new_seq);
- last_seq = &seq.back();
+ if (same) {
+ last_seq->length += 1;
+ continue;
+ }
}
- } else {
- llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
+ llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1};
seq.push_back(new_seq);
+ last_seq = &seq.back();
}
// keep shared prompts first at the end, then sort by length descending.
std::sort(seq.begin(), seq.end(),
std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
std::vector<ggml_backend_t> backends;
-#ifdef GGML_USE_METAL
- ggml_backend_t backend_metal = nullptr;
-#endif
-#ifdef GGML_USE_BLAS
- ggml_backend_t backend_blas = nullptr;
-#endif
+ std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+
ggml_backend_t backend_cpu = nullptr;
ggml_threadpool_t threadpool = nullptr;
count += (int) model.rpc_servers.size();
#endif
-#if defined(GGML_USE_METAL)
- count += 1;
-#elif defined(GGML_USE_SYCL)
- count += ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
- count += ggml_backend_vk_get_device_count();
-#elif defined(GGML_USE_CANN)
- count += ggml_backend_cann_get_device_count();
-#endif
-
return count;
GGML_UNUSED(model);
}
}
-#if defined(GGML_USE_SYCL)
- if (host_buffer) {
- buft = ggml_backend_sycl_host_buffer_type();
- }
-#elif defined(GGML_USE_CANN)
- if (host_buffer) {
- buft = ggml_backend_cann_host_buffer_type();
- }
-#elif defined(GGML_USE_CPU_HBM)
+#if defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
- if (host_buffer) {
- buft = ggml_backend_vk_host_buffer_type();
- }
#endif
if (buft == nullptr) {
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_RPC)
- int rpc_count = (int)model.rpc_servers.size();
- if (device < rpc_count) {
- const char * endpoint = model.rpc_servers[device].c_str();
- return ggml_backend_rpc_buffer_type(endpoint);
- }
- device -= rpc_count;
-#endif
-
if (device < (int)model.devices.size()) {
return ggml_backend_dev_buffer_type(model.devices[device]);
}
device -= (int)model.devices.size();
-#if defined(GGML_USE_METAL)
- buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_VULKAN)
- buft = ggml_backend_vk_buffer_type(device);
-#elif defined(GGML_USE_SYCL)
- buft = ggml_backend_sycl_buffer_type(device);
-#elif defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(device);
-#elif defined(GGML_USE_CANN)
- buft = ggml_backend_cann_buffer_type(device);
#endif
if (buft == nullptr) {
}
}
-#ifdef GGML_USE_SYCL
- if (ggml_backend_sycl_get_device_count() > 1) {
- buft = ggml_backend_sycl_split_buffer_type(tensor_split);
- }
-#endif
-
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu);
}
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
- int rpc_count = (int)model.rpc_servers.size();
- if (device < rpc_count) {
- size_t total;
- size_t free;
- const char * endpoint = model.rpc_servers[device].c_str();
- ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
- return free;
- }
- device = device - rpc_count;
-#endif
-
if (device < (int)model.devices.size()) {
ggml_backend_dev_t dev = model.devices[device];
size_t total;
return free;
}
-#if defined(GGML_USE_SYCL)
- size_t total;
- size_t free;
- ggml_backend_sycl_get_device_memory(device, &free, &total);
- return free;
-#elif defined(GGML_USE_VULKAN)
- size_t total;
- size_t free;
- ggml_backend_vk_get_device_memory(device, &free, &total);
- return free;
-#elif defined(GGML_USE_CANN)
- size_t total;
- size_t free;
- ggml_backend_cann_get_device_memory(device, &free, &total);
- return free;
-#else
+ if (model.devices.size() > 0) {
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(model.devices[0]);
+ LLAMA_LOG_WARN("%s: failed to get free memmory of device:%d of backend:%s, for device id is out of range.\n", __func__, device, ggml_backend_reg_name(reg));
+ } else {
+ LLAMA_LOG_WARN("%s: failed to get free memmory of device, no devices in inputted model.\n", __func__);
+ }
return 1;
-#endif
+
GGML_UNUSED(model);
GGML_UNUSED(device);
}
}
};
+// temporarily allocate memory for the input batch if needed
+static const llama_seq_id batch_default_seq_id = 0;
+struct llama_batch_allocr {
+ std::array<llama_seq_id, 1> seq_id_0 = {batch_default_seq_id};
+ std::vector<llama_pos> pos;
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id *> seq_id;
+ std::vector<int8_t> logits;
+ struct llama_batch batch;
+ // fill in any missing fields of the batch (e.g. a batch returned by llama_batch_get_one)
+ llama_batch_allocr(llama_context & ctx, struct llama_batch in_batch) {
+ batch = in_batch;
+ GGML_ASSERT(batch.n_tokens > 0);
+ if (!batch.pos) {
+ // determine the last position in KV cache
+ llama_pos last_pos = -1;
+ for (const auto & cell : ctx.kv_self.cells) {
+ if (cell.has_seq_id(batch_default_seq_id)) {
+ last_pos = std::max(last_pos, cell.pos);
+ }
+ }
+ last_pos++; // next position
+ pos.resize(batch.n_tokens);
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ pos[i] = i+last_pos;
+ }
+ batch.pos = pos.data();
+ }
+ if (!batch.n_seq_id) {
+ n_seq_id.resize(batch.n_tokens);
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ n_seq_id[i] = seq_id_0.size();
+ }
+ batch.n_seq_id = n_seq_id.data();
+ }
+ if (!batch.seq_id) {
+ seq_id.resize(batch.n_tokens + 1);
+ seq_id[batch.n_tokens] = NULL;
+ for (int32_t i = 0; i < batch.n_tokens; i++) {
+ seq_id[i] = seq_id_0.data();
+ }
+ batch.seq_id = seq_id.data();
+ }
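+ // by default, request logits only for the last token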
+ if (!batch.logits) {
+ logits.resize(batch.n_tokens);
+ logits[logits.size() - 1] = true;
+ batch.logits = logits.data();
+ }
+ }
+};
+
template<>
bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
uint32_t tmp;
vocab.type = LLAMA_VOCAB_TYPE_NONE;
// default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
- vocab.linefeed_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
+ vocab.special_eos_id = LLAMA_TOKEN_NULL;
+ vocab.special_unk_id = LLAMA_TOKEN_NULL;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
+ vocab.linefeed_id = LLAMA_TOKEN_NULL;
// read vocab size from metadata
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
vocab.special_bos_id = 1;
vocab.special_eos_id = 2;
vocab.special_unk_id = 0;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "bert") {
vocab.type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
+ vocab.special_eos_id = LLAMA_TOKEN_NULL;
vocab.special_unk_id = 100;
vocab.special_sep_id = 102;
vocab.special_pad_id = 0;
// default special tokens
vocab.special_bos_id = 11;
vocab.special_eos_id = 11;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
+ vocab.special_unk_id = LLAMA_TOKEN_NULL;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "t5") {
vocab.type = LLAMA_VOCAB_TYPE_UGM;
// default special tokens
- vocab.special_bos_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
vocab.special_eos_id = 1;
vocab.special_unk_id = 2;
- vocab.special_sep_id = -1;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
vocab.special_pad_id = 0;
- vocab.special_cls_id = -1;
- vocab.special_mask_id = -1;
+ vocab.special_cls_id = LLAMA_TOKEN_NULL;
+ vocab.special_mask_id = LLAMA_TOKEN_NULL;
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
if (precompiled_charsmap_keyidx != -1) {
vocab.type = LLAMA_VOCAB_TYPE_RWKV;
// default special tokens
- vocab.special_bos_id = -1;
- vocab.special_eos_id = -1;
- vocab.special_unk_id = -1;
- vocab.special_sep_id = -1;
- vocab.special_pad_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
+ vocab.special_eos_id = LLAMA_TOKEN_NULL;
+ vocab.special_unk_id = LLAMA_TOKEN_NULL;
+ vocab.special_sep_id = LLAMA_TOKEN_NULL;
+ vocab.special_pad_id = LLAMA_TOKEN_NULL;
} else {
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
}
} else if (
tokenizer_pre == "chatglm-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
- vocab.special_bos_id = -1;
+ vocab.special_bos_id = LLAMA_TOKEN_NULL;
} else if (
tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
- // For Fill-In-the-Middle (FIM)/infill models which where converted
- // prior to support of FIM special tokens in GGUF, the following
- // will allow those models to continue to work. The general names
- // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
- // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
- // new versions of these models have been published.
- std::string gen_name;
- ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
-
- std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
- [](unsigned char c){ return std::tolower(c); });
-
- if (gen_name.find("code") != std::string::npos) {
- if (model.arch == LLM_ARCH_LLAMA
- && 32010 < vocab.id_to_token.size()
- && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
- && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
- && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
- && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
- vocab.special_prefix_id = 32007;
- vocab.special_suffix_id = 32008;
- vocab.special_middle_id = 32009;
- vocab.special_eot_id = 32010;
- } else if (model.arch == LLM_ARCH_GEMMA
- && 107 < vocab.id_to_token.size()
- && vocab.id_to_token[67].text == "<|fim_prefix|>"
- && vocab.id_to_token[69].text == "<|fim_suffix|>"
- && vocab.id_to_token[68].text == "<|fim_middle|>"
- && vocab.id_to_token[107].text == "<end_of_turn>") {
- vocab.special_prefix_id = 67;
- vocab.special_suffix_id = 69;
- vocab.special_middle_id = 68;
- // TODO: this is not EOT, it is "file separator" token, needs fix
- // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
- //vocab.special_eot_id = 70;
- vocab.special_eot_id = 107;
- }
- }
try {
vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
} catch (const std::exception & e) {
// special tokens
{
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
- { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
- { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
- { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
- { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
- { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
- { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
- { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
- { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
- { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
- { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
- { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
- { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
+ { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+ { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
+ { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+ { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
+ { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
};
for (const auto & it : special_token_types) {
}
}
- // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
- //
- // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
- // for now, we apply this workaround to find the EOT token based on its text
- if (vocab.special_eot_id == -1) {
- for (const auto & t : vocab.token_to_id) {
+ // auto-detect special tokens by text
+ // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+ // for now, we apply this workaround to find the tokens based on their text
+
+ for (const auto & t : vocab.token_to_id) {
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
if (false
- // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
- // need to fix convert script
- //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|| t.first == "<|eot_id|>"
|| t.first == "<|im_end|>"
|| t.first == "<|end|>"
|| t.first == "<end_of_turn>"
|| t.first == "<|endoftext|>"
|| t.first == "<EOT>"
+ || t.first == "<|end▁of▁sentence|>" // DeepSeek
) {
vocab.special_eot_id = t.second;
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.first.c_str());
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find EOM token: "<|eom_id|>"
+ if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eom_id|>"
+ ) {
+ vocab.special_eom_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+ if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_prefix|>" // Qwen
+ || t.first == "<fim-prefix>"
+ || t.first == "<|fim▁begin|>" // DeepSeek
+ || t.first == "<PRE>"
+ ) {
+ vocab.special_fim_pre_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
- break;
}
}
- }
- // find EOM token: "<|eom_id|>"
- //
- // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
- // for now, we apply this workaround to find the EOM token based on its text
- if (vocab.special_eom_id == -1) {
- const auto & t = vocab.token_to_id.find("<|eom_id|>");
- if (t != vocab.token_to_id.end()) {
- vocab.special_eom_id = t->second;
- if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t->first.c_str());
- vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+ if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_suffix|>" // Qwen
+ || t.first == "<fim-suffix>"
+ || t.first == "<|fim▁hole|>" // DeepSeek
+ || t.first == "<SUF>"
+ ) {
+ vocab.special_fim_suf_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+ if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_middle|>" // Qwen
+ || t.first == "<fim-middle>"
+ || t.first == "<|fim▁end|>" // DeepSeek
+ || t.first == "<MID>"
+ ) {
+ vocab.special_fim_mid_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+ if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_pad|>" // Qwen
+ || t.first == "<fim-pad>"
+ || t.first == "<PAD>"
+ ) {
+ vocab.special_fim_pad_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+ if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_repo|>" // Qwen
+ || t.first == "<|repo_name|>"
+ || t.first == "<fim-repo>"
+ || t.first == "<REPO>"
+ ) {
+ vocab.special_fim_rep_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
+ }
+ }
+
+ // find FIM_SEP token: "<|file_sep|>"
+ if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|file_sep|>" // Qwen
+ ) {
+ vocab.special_fim_sep_id = t.second;
+ if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
}
}
}
// this is currently determined based on the token text, which is obviously not ideal
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
vocab.special_eog_ids.clear();
+
+ if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+ }
+
+ if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+ }
+
+ if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+ vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+ }
+
for (const auto & t : vocab.token_to_id) {
if (false
|| t.first == "<|eot_id|>"
) {
vocab.special_eog_ids.insert(t.second);
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
- LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
- __func__, t.first.c_str());
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
}
+ } else {
+ // token is control, but not marked as EOG -> print a debug log
+ if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
}
}
- if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+ // sanity checks
+ if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eos_id);
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
- if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+ if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eot_id);
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
- if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+ if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
vocab.special_eog_ids.insert(vocab.special_eom_id);
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
}
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
// special tokens
- if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
- if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
- if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
- if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
- if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
- if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
- if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
-
- if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
- if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
- if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
- if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
- if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
- if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+ if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+ if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+ if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+ if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+ if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+ if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+ if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+ if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
+ if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+
+ if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+ if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+ if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+ if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+ if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+ if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+ if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
for (const auto & id : vocab.special_eog_ids) {
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
// assign cpu layers
for (int i = 0; i < i_gpu_start; ++i) {
+#ifdef GGML_USE_AMX
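+ // with AMX enabled, pair the AMX buffer type with the default CPU buffer type for the
+ // CPU layers (assumed: the first entry is used for the weight matrices, the second for the remaining tensors)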
+ model.buft_layer[i] = {
+ ggml_backend_amx_buffer_type(),
+ llama_default_buffer_type_cpu(model, true)
+ };
+#else
model.buft_layer[i] = llama_default_buffer_type_cpu(model, true);
+#endif
}
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
llama_buf_map bufs;
bufs.reserve(n_max_backend_buffer);
- // only the mmap region containing the tensors in the model is mapped to the backend buffer
- // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
- // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
- if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
- for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
- void * addr = nullptr;
- size_t first, last;
- ml.get_mapping_range(&first, &last, &addr, idx, ctx);
- if (first >= last) {
- continue;
- }
- ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
- if (buf == nullptr) {
- throw std::runtime_error("unable to allocate backend CPU buffer");
- }
- model.bufs.push_back(buf);
- bufs.emplace(idx, buf);
- }
+ // check if this backend device supports buffer_from_host_ptr
+ // when using a host buffer as the CPU backend buffer, use the CPU device to prioritize using buffer_from_host_ptr over the host buffer
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft == llama_default_buffer_type_cpu(model, true) ? ggml_backend_cpu_buffer_type() : buft);
+ bool buffer_from_host_ptr_supported = false;
+ if (dev) {
+ ggml_backend_dev_props props;
+ ggml_backend_dev_get_props(dev, &props);
+ buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
}
-#ifdef GGML_USE_METAL
- else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
+
+ if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
- const size_t max_size = ggml_get_max_tensor_size(ctx);
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
void * addr = nullptr;
- size_t first, last;
+ size_t first, last; // NOLINT
ml.get_mapping_range(&first, &last, &addr, idx, ctx);
if (first >= last) {
continue;
}
- ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
+ const size_t max_size = ggml_get_max_tensor_size(ctx);
+ ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
if (buf == nullptr) {
- throw std::runtime_error("unable to allocate backend metal buffer");
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
model.bufs.push_back(buf);
bufs.emplace(idx, buf);
}
}
-#endif
else {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (buf == nullptr) {
- throw std::runtime_error("unable to allocate backend buffer");
+ throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
}
model.bufs.push_back(buf);
if (use_mlock && ggml_backend_buffer_is_host(buf)) {
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2) {
- ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
- }
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
} else {
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM) {
- // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
- // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
- ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
- }
+ // note: this op tends to require high floating point range
+ // while for some models F16 is enough, for others it is not, so we default to F32 here
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
if (model.arch == LLM_ARCH_GROK) {
// need to do the following:
// kq = 30 * tanh(kq / 30)
// before the softmax below
- //try from phi2
- //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
-
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
kq = ggml_scale(ctx, kq, 30);
}
llama_context & lctx;
const llama_hparams & hparams;
const llama_cparams & cparams;
- const llama_ubatch & batch;
+ const llama_ubatch & ubatch;
const llama_kv_cache & kv_self;
const int64_t n_embd;
// TODO: consider making the entire interface noexcept
llm_build_context(
llama_context & lctx,
- const llama_ubatch & batch,
+ const llama_ubatch & ubatch,
const llm_build_cb & cb,
bool worst_case) :
model (lctx.model),
lctx (lctx),
hparams (model.hparams),
cparams (lctx.cparams),
- batch (batch),
+ ubatch (ubatch),
kv_self (lctx.kv_self),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
beta_slow (cparams.yarn_beta_slow),
norm_eps (hparams.f_norm_eps),
norm_rms_eps (hparams.f_norm_rms_eps),
- n_tokens (batch.n_tokens),
+ n_tokens (ubatch.n_tokens),
n_kv (worst_case ? kv_self.size : kv_self.n),
n_outputs (worst_case ? n_tokens : lctx.n_outputs),
n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// multiply by embedding_multiplier_scale of 78.38367176906169
inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
}
// construct input embeddings (token, type, position)
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// token types are hardcoded to zero ("Sentence A")
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * ffn_output;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * pos;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// scale the input embeddings
inpL = ggml_scale(ctx0, inpL, scale_embd);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
cb(inpL, "inp_scaled", -1);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * inpL;
// {n_embd, n_tokens}
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
struct ggml_tensor * state_copy = build_inp_s_copy();
struct ggml_tensor * state_mask = build_inp_s_mask();
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
- cur = llm_build_mamba(ctx0, lctx, batch, gf, cur,
+ cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur,
state_copy, state_mask,
kv_head, n_kv, cb, il);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * inpL;
// {n_embd, n_tokens}
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
GGML_ASSERT(lctx.is_encoding);
struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
GGML_ASSERT(!lctx.is_encoding);
GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// Token shift state dimensions should be 2 * n_emb
GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
- const int64_t n_seqs = batch.n_seqs;
- const int64_t n_seq_tokens = batch.n_seq_tokens;
- const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
GGML_ASSERT(n_seqs != 0);
- GGML_ASSERT(batch.equal_seqs);
+ GGML_ASSERT(ubatch.equal_seqs);
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
struct ggml_tensor * cur;
struct ggml_tensor * state_copy = build_inp_s_copy();
struct ggml_tensor * state_mask = build_inp_s_mask();
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
for (int il = 0; il < n_layer; ++il) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_norm", -1);
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
cb(cur, "result_output", -1);
+
ggml_build_forward_expand(gf, cur);
return gf;
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
- inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
static struct ggml_cgraph * llama_build_graph(
llama_context & lctx,
- const llama_ubatch & batch,
+ const llama_ubatch & ubatch,
bool worst_case) {
const auto & model = lctx.model;
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
- if (batch.n_tokens < 32 || full_offload) {
+ if (ubatch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
for (auto * backend : lctx.backends) {
if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
struct ggml_cgraph * result = NULL;
- struct llm_build_context llm(lctx, batch, cb, worst_case);
+ struct llm_build_context llm(lctx, ubatch, cb, worst_case);
llm.init();
return relative_bucket;
}
-static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
//
// set input data
//
const auto & cparams = lctx.cparams;
const auto & kv_self = lctx.kv_self;
- if (batch.token) {
- const int64_t n_tokens = batch.n_tokens;
+ if (ubatch.token) {
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
+ ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
}
- if (batch.embd) {
+ if (ubatch.embd) {
const int64_t n_embd = hparams.n_embd;
- const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
}
- if (batch.pos && lctx.inp_pos) {
- const int64_t n_tokens = batch.n_tokens;
+ if (ubatch.pos && lctx.inp_pos) {
+ const int64_t n_tokens = ubatch.n_tokens;
- ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+ ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
}
if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
- const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
int32_t * data = (int32_t *) lctx.inp_out_ids->data;
for (int i = 0; i < n_tokens; ++i) {
data[i] = i;
}
- } else if (batch.output) {
+ } else if (ubatch.output) {
int32_t n_outputs = 0;
for (int i = 0; i < n_tokens; ++i) {
- if (batch.output[i]) {
+ if (ubatch.output[i]) {
data[n_outputs++] = i;
}
}
// NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
if (cparams.causal_attn && !lctx.is_encoding) {
const int64_t n_kv = kv_self.n;
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_seq_tokens = batch.n_seq_tokens;
- const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_tokens = ubatch.n_tokens;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
float * data = nullptr;
}
// For causal attention, use only the previous KV cells
- // of the correct sequence for each token of the batch.
+ // of the correct sequence for each token of the ubatch.
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
for (int h = 0; h < 1; ++h) {
for (int s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = batch.seq_id[s][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
for (int j = 0; j < n_seq_tokens; ++j) {
- const llama_pos pos = batch.pos[s*n_seq_tokens + j];
+ const llama_pos pos = ubatch.pos[s*n_seq_tokens + j];
for (int i = 0; i < n_kv; ++i) {
float f;
}
}
} else {
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_seq_tokens = batch.n_seq_tokens;
- const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_tokens = ubatch.n_tokens;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
// when using kv cache, the mask needs to match the kv cache size
const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;
for (int h = 0; h < 1; ++h) {
for (int s1 = 0; s1 < n_seqs; ++s1) {
- const llama_seq_id seq_id = batch.seq_id[s1][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s1][0];
for (int j = 0; j < n_seq_tokens; ++j) {
const int32_t tj = s1*n_seq_tokens + j;
const int32_t ti = s0*n_seq_tokens + i;
float f = -INFINITY;
- for (int s = 0; s < batch.n_seq_id[s0]; ++s) {
- if (batch.seq_id[s0][s] == seq_id) {
+ for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) {
+ if (ubatch.seq_id[s0][s] == seq_id) {
if (hparams.use_alibi) {
- f = -std::abs(batch.pos[ti] - batch.pos[tj]);
+ f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]);
} else {
f = 0.0f;
}
}
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_seq_tokens = batch.n_seq_tokens;
- const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_tokens = ubatch.n_tokens;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
GGML_ASSERT(lctx.inp_mean);
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
std::vector<uint64_t> sum(n_tokens, 0);
for (int s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = batch.seq_id[s][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
- // TODO: adapt limits to n_seqs when batch.equal_seqs is true
+ // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
- sum[seq_id] += batch.n_seq_tokens;
+ sum[seq_id] += ubatch.n_seq_tokens;
}
std::vector<float> div(n_tokens, 0.0f);
}
for (int s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = batch.seq_id[s][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
for (int i = 0; i < n_seq_tokens; ++i) {
data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
if (cparams.embeddings && (
cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_seq_tokens = batch.n_seq_tokens;
- const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_tokens = ubatch.n_tokens;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
GGML_ASSERT(lctx.inp_cls);
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
for (int s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = batch.seq_id[s][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
- // TODO: adapt limits to n_seqs when batch.equal_seqs is true
+ // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
for (int i = 0; i < n_seq_tokens; ++i) {
- const llama_pos pos = batch.pos[s*n_seq_tokens + i];
+ const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];
if (pos == 0) {
data[seq_id] = s*n_seq_tokens + i;
}
if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
- const int64_t n_tokens = batch.n_tokens;
- const int64_t n_seq_tokens = batch.n_seq_tokens;
- const int64_t n_seqs = batch.n_seqs;
+ const int64_t n_tokens = ubatch.n_tokens;
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+ const int64_t n_seqs = ubatch.n_seqs;
GGML_ASSERT(lctx.inp_cls);
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
std::vector<int> last_row(n_tokens, -1);
for (int s = 0; s < n_seqs; ++s) {
- const llama_seq_id seq_id = batch.seq_id[s][0];
+ const llama_seq_id seq_id = ubatch.seq_id[s][0];
- // TODO: adapt limits to n_seqs when batch.equal_seqs is true
+ // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
for (int i = 0; i < n_seq_tokens; ++i) {
- const llama_pos pos = batch.pos[s*n_seq_tokens + i];
+ const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];
if (pos >= last_pos[seq_id]) {
last_pos[seq_id] = pos;
}
if (lctx.inp_pos_bucket) {
- const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
- GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing
+ GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing
int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_kv; ++i) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
}
}
}
for (int h = 0; h < 1; ++h) {
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_tokens; ++i) {
- data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
}
}
}
if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
- const int64_t n_tokens = batch.n_tokens;
+ const int64_t n_tokens = ubatch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
- GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing
+ GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing
float * data = (float *) lctx.inp_KQ_mask_cross->data;
for (int j = 0; j < n_tokens; ++j) {
for (int i = 0; i < n_output_enc; ++i) {
float f = -INFINITY;
- for (int s = 0; s < batch.n_seq_id[j]; ++s) {
- const llama_seq_id seq_id = batch.seq_id[j][s];
+ for (int s = 0; s < ubatch.n_seq_id[j]; ++s) {
+ const llama_seq_id seq_id = ubatch.seq_id[j][s];
if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
f = 0.0f;
}
int n_threads,
ggml_threadpool * threadpool) {
if (lctx.backend_cpu != nullptr) {
- ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
-#ifdef GGML_USE_BLAS
- if (lctx.backend_blas != nullptr) {
- ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+
+ // set the number of threads for all the backends
+ for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
+ set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
}
-#endif
- ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+ auto err = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+ if (err != GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
+ }
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
}
//
static int llama_decode_internal(
llama_context & lctx,
- llama_batch batch_all) { // TODO: rename back to batch
+ llama_batch inp_batch) {
lctx.is_encoding = false;
- const uint32_t n_tokens_all = batch_all.n_tokens;
- if (n_tokens_all == 0) {
+ if (inp_batch.n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1;
}
+ // temporarily allocate memory for the input batch if needed
+ llama_batch_allocr batch_allocr(lctx, inp_batch);
+ const llama_batch & batch = batch_allocr.batch;
+ const uint32_t n_tokens_all = batch.n_tokens;
+
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
- GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
+ GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
- if (batch_all.token) {
+ if (batch.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
- if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch_all.token[i]);
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
return -1;
}
}
lctx.embd_seq.clear();
// count outputs
- if (batch_all.logits && !embd_pooled) {
+ if (batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
- n_outputs += batch_all.logits[i] != 0;
+ n_outputs += batch.logits[i] != 0;
}
} else if (lctx.logits_all || embd_pooled) {
n_outputs = n_tokens_all;
n_outputs = 1;
}
- lctx.sbatch.from_batch(batch_all, n_embd,
+ lctx.sbatch.from_batch(batch, n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
//
static int llama_encode_internal(
llama_context & lctx,
- llama_batch batch) {
+ llama_batch inp_batch) {
lctx.is_encoding = true;
- const uint32_t n_tokens = batch.n_tokens;
-
- if (n_tokens == 0) {
+ if (inp_batch.n_tokens == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
return -1;
}
+ // temporarily allocate memory for the input batch if needed
+ llama_batch_allocr batch_allocr(lctx, inp_batch);
+ const llama_batch & batch = batch_allocr.batch;
+ const uint32_t n_tokens = batch.n_tokens;
+
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
}
float * f32_output = (float *) output.data();
- ggml_type_traits_t qtype;
+ const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
if (ggml_is_quantized(tensor->type)) {
- qtype = ggml_internal_get_type_traits(tensor->type);
- if (qtype.to_float == NULL) {
+ if (qtype->to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
}
} else if (tensor->type != GGML_TYPE_F16 &&
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
- qtype.to_float(tensor->data, f32_output, nelements);
+ qtype->to_float(tensor->data, f32_output, nelements);
} else {
GGML_ABORT("fatal error"); // unreachable
}
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
- qtype.to_float(inbuf, outbuf, nels);
+ qtype->to_float(inbuf, outbuf, nels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
}
bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
+#if defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
- ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+ ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+ llama_supports_rpc();
#endif
}
+bool llama_supports_rpc(void) {
+ return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
void llama_backend_init(void) {
ggml_time_init();
model->rpc_servers.push_back(servers);
}
+ // add RPC devices
+ if (!model->rpc_servers.empty()) {
+ ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+ llama_free_model(model);
+ return nullptr;
+ }
+
+ // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+ using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+ ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+ if (!ggml_backend_rpc_add_device_fn) {
+ LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+ llama_free_model(model);
+ return nullptr;
+ }
+
+ for (const std::string & server : model->rpc_servers) {
+ ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+ if (dev) {
+ model->devices.push_back(dev);
+ } else {
+ LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+ llama_free_model(model);
+ return nullptr;
+ }
+ }
+ }
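+ // note: the RPC devices are registered before the locally enumerated devices below,
+ // so they appear first in model->devices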
+
// create list of devices to use with this model
// currently, we use all available devices
// TODO: rework API to give user more control over device selection
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
- // skip the CPU backend since it is handled separately
- if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU_FULL) {
- model->devices.push_back(dev);
+ switch (ggml_backend_dev_type(dev)) {
+ case GGML_BACKEND_DEVICE_TYPE_CPU:
+ case GGML_BACKEND_DEVICE_TYPE_CPU_FULL:
+ // skip CPU backends since they are handled separately
+ break;
+
+ case GGML_BACKEND_DEVICE_TYPE_GPU:
+ case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+ {
+ size_t free, total; // NOLINT
+ ggml_backend_dev_memory(dev, &free, &total);
+ LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
+ model->devices.push_back(dev);
+ break;
+ }
}
}
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
- delete model;
+ llama_free_model(model);
return nullptr;
}
params.flash_attn = false;
}
- if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
+ if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
main_gpu -= (int)model->devices.size();
}
-#if defined(GGML_USE_RPC)
- if (model->n_gpu_layers > 0) {
- for (const auto & endpoint : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
- if (main_gpu >= (int)model->rpc_servers.size()) {
- main_gpu -= (int)model->rpc_servers.size();
- }
-#endif
-
-#if defined(GGML_USE_METAL)
- if (model->n_gpu_layers > 0) {
- ctx->backend_metal = ggml_backend_metal_init();
- if (ctx->backend_metal == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(ctx->backend_metal);
- }
-#elif defined(GGML_USE_VULKAN)
- if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
- LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
- ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- } else {
- for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
- ggml_backend_t backend = ggml_backend_vk_init(device);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
-#elif defined(GGML_USE_SYCL)
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
- ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- } else {
- // LLAMA_SPLIT_LAYER requires a backend for each GPU
- for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
- ggml_backend_t backend = ggml_backend_sycl_init(i);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
-#elif defined(GGML_USE_KOMPUTE)
+#if defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) {
auto * backend = ggml_backend_kompute_init(main_gpu);
if (backend == nullptr) {
}
ctx->backends.push_back(backend);
}
-#elif defined(GGML_USE_CANN)
- // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
- // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
- if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
- ggml_backend_t backend = ggml_backend_cann_init(main_gpu);
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu);
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- } else {
- // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
- // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
- for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
- ggml_backend_t backend = ggml_backend_cann_init(device);
+#endif
+
+ // add other backends (such as BLAS)
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
+ LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
-#endif
-
-#ifdef GGML_USE_BLAS
- ctx->backend_blas = ggml_backend_blas_init();
- if (ctx->backend_blas == nullptr) {
- LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
- } else {
- ctx->backends.push_back(ctx->backend_blas);
- }
-#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
}
ctx->backends.push_back(ctx->backend_cpu);
+ // create a list of the set_n_threads functions in the backends
+ for (auto * backend : ctx->backends) {
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+ if (reg) {
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+ if (ggml_backend_set_n_threads_fn) {
+ ctx->set_n_threads_fns.emplace_back(backend, ggml_backend_set_n_threads_fn);
+ }
+ }
+ }
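+ // these function pointers are used later in the compute path (see the thread-setting loop above)
+ // to propagate the current n_threads to every backend that exposes ggml_backend_set_n_threads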
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
}
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+ (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
struct llama_batch llama_batch_get_one(
llama_token * tokens,
- int32_t n_tokens,
- llama_pos pos_0,
- llama_seq_id seq_id) {
+ int32_t n_tokens) {
return {
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
- /*all_pos_0 =*/ pos_0,
- /*all_pos_1 =*/ 1,
- /*all_seq_id =*/ seq_id,
};
}
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
/*logits =*/ nullptr,
- /*all_pos_0 =*/ 0,
- /*all_pos_1 =*/ 0,
- /*all_seq_id =*/ 0,
};
if (embd) {
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_encode_internal(*ctx, batch);
- if (ret < 0) {
+ if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
}
struct llama_context * ctx,
struct llama_batch batch) {
const int ret = llama_decode_internal(*ctx, batch);
- if (ret < 0) {
+ if (ret != 0) {
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
}
return llama_token_eos_impl(model->vocab);
}
+llama_token llama_token_eot(const struct llama_model * model) {
+ return llama_token_eot_impl(model->vocab);
+}
+
llama_token llama_token_cls(const struct llama_model * model) {
return llama_token_cls_impl(model->vocab);
}
return llama_token_suffix_impl(model->vocab);
}
-llama_token llama_token_eot(const struct llama_model * model) {
- return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+ return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+ return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+ return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+ return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+ return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+ return llama_token_fim_sep_impl(model->vocab);
}
//
if (add_ass) {
ss << "[|assistant|]";
}
+ } else if (tmpl == "rwkv-world" || tmpl_contains("rwkv-world")) {
+ // this template requires the model to have "\n\n" as EOT token
+ for (auto message : chat) {
+ std::string role(message->role);
+ if (role == "user") {
+ ss << "User: " << message->content << "\n\nAssistant:";
+ } else {
+ ss << message->content << "\n\n";
+ }
+ }
} else {
// template not supported
return -1;
return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
}
+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+ return llama_sampler_init_infill_impl(model->vocab);
+}
+
+struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
+ return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
+}
+
//
// model split
//
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
+ s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";