{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
- const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+ supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
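+ // note: this now writes the class member (declared in llama-context.h, see the hunk
+ // below) instead of a function-local const, so the decode/encode paths can consult
+ // the same flag later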
if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
cparams.kv_unified = true;
}
}
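// usage (illustrative, not part of this diff): the flag is read once at context
// creation, so a hypothetical invocation such as `LLAMA_SET_ROWS=1 ./llama-cli -m model.gguf`
// opts in to the ggml_set_rows() path; any non-zero value enables it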
+ if (!supports_set_rows) {
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+ // overlap with device computation.
+ ggml_backend_sched_reset(sched.get());
+ }
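+ // (illustrative rationale: the scheduler reset is CPU-only bookkeeping, so issuing it
+ // before the backend sync lets it overlap with in-flight device work; when set_rows is
+ // supported the reset is skipped here, presumably to keep scheduler state for the next graph)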
+
// TODO: hacky solution
if (model.arch == LLM_ARCH_T5 && t_embd) {
//cross.t_embd = t_embd;
// wait for the computation to finish (automatically done when obtaining the model output)
//synchronize();
+ if (!supports_set_rows) {
+ // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+ // overlap with device computation.
+ ggml_backend_sched_reset(sched.get());
+ }
+
return 0;
}
bool has_evaluated_once = false;
+ // env: LLAMA_SET_ROWS (temporary)
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+ bool supports_set_rows = false;
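+ // defaults to false, so the set_rows code path stays opt-in via the environment
+ // variable until this temporary flag is removed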
+
// perf
mutable int64_t t_start_us = 0;
mutable int64_t t_load_us = 0;