params.cache_type_v = kv_cache_type_from_str(value);
}
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
- add_opt(common_arg(
- {"--perplexity", "--all-logits"},
- string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
- [](common_params & params) {
- params.logits_all = true;
- }
- ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
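// NOTE: with --perplexity / --all-logits gone there is no global "logits for
// every token" switch; callers mark the tokens they need on the batch itself
// via llama_batch.logits (see the sketch after llama_context_params below).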
add_opt(common_arg(
{"--hellaswag"},
"compute HellaSwag score over random tasks from datafile supplied with -f",
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
- cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
cparams.rope_freq_base = params.rope_freq_base;
bool ctx_shift = true; // context shift on infinite text generation
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
- // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
- // TODO: move at the end of the struct
- bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
- bool embeddings; // if true, extract embeddings (together with logits)
- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
- bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
- bool no_perf; // whether to measure performance timings
-
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
// currently works only with CPU execution
ggml_abort_callback abort_callback;
void * abort_callback_data;
+
+ // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+ bool embeddings; // if true, extract embeddings (together with logits)
+ bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+ bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
+ bool no_perf; // whether to measure performance timings
};
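// Caller-side sketch (illustrative only): with logits_all removed, logits are
// requested per token through llama_batch.logits, as the old deprecation note
// already suggested. The identifiers below are the public llama.h batch API.
//
//     llama_batch batch = llama_batch_init(n_tokens, 0, 1);
//     for (int32_t i = 0; i < n_tokens; ++i) {
//         batch.token[i]     = tokens[i];
//         batch.pos[i]       = i;
//         batch.n_seq_id[i]  = 1;
//         batch.seq_id[i][0] = 0;
//         batch.logits[i]    = true; // request logits for this token
//     }
//     batch.n_tokens = n_tokens;
//     llama_decode(ctx, batch);  // read back via llama_get_logits_ith(ctx, i)
//     llama_batch_free(batch);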
// model quantization parameters
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
- logits_all = params.logits_all;
-
if (!hparams.vocab_only) {
// GPU backends
for (auto * dev : model.devices) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
- } else if (logits_all || embd_pooled) {
+ } else if (embd_pooled) {
n_outputs_all = n_tokens_all;
} else {
// keep last output only
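// (with logits_all removed, the output count is determined by the batch alone:
//  explicit batch.logits flags when given, every token when embeddings are
//  pooled, otherwise only the last token)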
/*.cb_eval_user_data =*/ nullptr,
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
- /*.logits_all =*/ false,
+ /*.abort_callback =*/ nullptr,
+ /*.abort_callback_data =*/ nullptr,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
- /*.abort_callback =*/ nullptr,
- /*.abort_callback_data =*/ nullptr,
};
return result;
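// Usage sketch (illustrative; assumes the llama_init_from_model entry point):
// start from the defaults and override only the fields you need.
//
//     llama_context_params cparams = llama_context_default_params();
//     cparams.embeddings = true; // also extract embeddings
//     llama_context * ctx = llama_init_from_model(model, cparams);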
std::unique_ptr<llama_memory_i> memory;
- // TODO: remove
- bool logits_all = false;
-
// decode output (2-dimensional array: [n_outputs][n_vocab])
size_t logits_size = 0; // capacity (of floats) for logits
float * logits = nullptr;
params.out_file = "imatrix.dat";
params.n_ctx = 512;
- params.logits_all = true;
params.escape = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
- if (params.logits_all) {
- LOG_ERR("************\n");
- LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- LOG_ERR("************\n\n");
-
- return 0;
- }
-
if (params.embedding) {
LOG_ERR("************\n");
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
if (int(batch_indeces.size()) != num_answers) {
batch_indeces.resize(num_answers);
}
- for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
+
+ for (int s = 0; s < num_answers; ++s) {
+ batch_indeces[s] = s0 + s;
+ }
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
common_params params;
params.n_ctx = 512;
- params.logits_all = true;
params.escape = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {