params.embedding = true;
+ // if the number of prompts to be encoded is known in advance, it is more efficient to set the
+ // --parallel argument accordingly. for convenience, if it is not specified, we fall back to a unified
+ // KV cache in order to support any number of prompts
+ if (params.n_parallel == 1) {
+ LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
+ params.kv_unified = true;
+ }
+
// utilize the full context
if (params.n_batch < params.n_ctx) {
LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
return 1;
}
+ if (params.n_parallel == 1) {
+ // the example uses 2 sequences, so when n_parallel == 1, we need to enable unified kv cache
+ printf("%s: n_parallel == 1, enabling unified kv cache\n", __func__);
+ params.kv_unified = true;
+ }
+
common_init();
if (params.n_predict < 0) {
for (int32_t i = 0; i < batch.n_tokens; ++i) {
for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
- LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
+ LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d >= %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
return false;
}
}
llama_build_and_test(test-log.cpp)
llama_build_and_test(test-regex-partial.cpp)
-llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)
+llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4 -t 2)
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
auto cparams = common_context_params_to_llama(params);
+ // each context has a single sequence
+ cparams.n_seq_max = 1;
+
int dev_count = ggml_backend_dev_count();
int gpu_dev_count = 0;
for (int i = 0; i < dev_count; ++i) {