#include "common.h"
#include "log.h"
+#include <algorithm>
#include <cmath>
+#include <cstring>
#include <unordered_map>
-#include <algorithm>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
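// for example, with capacity 3: push_back(1), push_back(2), push_back(3), push_back(4)
// leaves {2, 3, 4} - once full, the oldest element is overwritten rather than growing the storage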
llama_token_data_array cur_p;
+ void reset() {
+ prev.clear();
+
+ llama_sampler_reset(grmr);
+ llama_sampler_reset(chain);
+ }
+
void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);
cur_p = { cur.data(), cur.size(), -1, false };
}
+
+ common_time_meas tm() {
+ return common_time_meas(t_total_us, params.no_perf);
+ }
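+
+ // assumption: common_time_meas is a small RAII timer along these lines - it records
+ // ggml_time_us() at construction and, unless disabled, adds the elapsed time to the
+ // referenced accumulator at destruction:
+ //
+ //   struct common_time_meas {
+ //       common_time_meas(int64_t & t_acc, bool disable)
+ //           : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+ //       ~common_time_meas() { if (t_start_us >= 0) { t_acc += ggml_time_us() - t_start_us; } }
+ //       const int64_t t_start_us;
+ //       int64_t & t_acc;
+ //   };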
+
+ mutable int64_t t_total_us = 0;
};
std::string common_params_sampling::print() const {
}
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+ const auto tm = gsmpl->tm();
+
if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
}
void common_sampler_reset(struct common_sampler * gsmpl) {
- llama_sampler_reset(gsmpl->grmr);
-
- llama_sampler_reset(gsmpl->chain);
+ gsmpl->reset();
}
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance
+ const double t_sampling_ms = gsmpl ? 1e-3 * gsmpl->t_total_us : 0.0;
+
+ llama_perf_sampler_data data_smpl;
+ llama_perf_context_data data_ctx;
+
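+ // zero-initialize so the perf data is well-defined even when gsmpl or ctx is null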
+ memset(&data_smpl, 0, sizeof(data_smpl));
+ memset(&data_ctx, 0, sizeof(data_ctx));
+
if (gsmpl) {
- llama_perf_sampler_print(gsmpl->chain);
+ auto & data = data_smpl;
+
+ data = llama_perf_sampler(gsmpl->chain);
+
+ // note: the sampling time includes the samplers time + extra time spent in common/sampling
+ LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+ LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
}
+
if (ctx) {
- llama_perf_context_print(ctx);
+ auto & data = data_ctx;
+
+ data = llama_perf_context(ctx);
+
+ const double t_end_ms = 1e-3 * ggml_time_us();
+
+ const double t_total_ms = t_end_ms - data.t_start_ms;
+ const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+ const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
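+
+ // hypothetical numbers to illustrate: a total of 1000.00 ms with sampling = 50.00 ms,
+ // prompt eval = 300.00 ms and eval = 600.00 ms gives
+ // t_unacc_ms = 1000 - (50 + 300 + 600) = 50 ms, i.e. t_unacc_pc = 5.0 %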
+
+ LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+ LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+ LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+ LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+ LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+ LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
+
llama_memory_breakdown_print(ctx);
}
}
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+ llama_synchronize(ctx);
+
+ // start measuring the sampling time after the llama_context synchronization to avoid measuring any ongoing async operations
+ const auto tm = gsmpl->tm();
+
gsmpl->set_logits(ctx, idx);
auto & grmr = gsmpl->grmr;
// helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+ const auto tm = gsmpl->tm();
+
auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {
#include "llama.h"
#include "ggml.h"
+#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
-#include <numeric>
/**
* This is the arbitrary data which will be passed to each callback.
return u.f;
}
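// reads a single element of a tensor's raw data as a float; a usage sketch with
// hypothetical indices i0..i3, assuming a host-accessible tensor t whose byte strides
// come straight from t->nb:
//
//   const float v = ggml_get_float_value((const uint8_t *) t->data, t->type, t->nb, i0, i1, i2, i3);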
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
- v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+ v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
- v = *(float *) &data[i];
+ v = *(const float *) &data[i];
} else if (type == GGML_TYPE_I64) {
- v = (float) *(int64_t *) &data[i];
+ v = (float) *(const int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
- v = (float) *(int32_t *) &data[i];
+ v = (float) *(const int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
- v = (float) *(int16_t *) &data[i];
+ v = (float) *(const int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
- v = (float) *(int8_t *) &data[i];
+ v = (float) *(const int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
- v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+ v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}
for (auto * smpl : chain->samplers) {
llama_sampler_reset(smpl);
}
-
- chain->t_sample_us = 0;
- chain->n_sample = 0;
}
static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
void llama_perf_sampler_print(const struct llama_sampler * chain) {
const auto data = llama_perf_sampler(chain);
- LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
- __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+ LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
}
void llama_perf_sampler_reset(struct llama_sampler * chain) {
auto * ctx = (struct llama_sampler_chain *) chain->ctx;
- ctx->t_sample_us = ctx->n_sample = 0;
+ ctx->t_sample_us = 0;
+ ctx->n_sample = 0;
}