LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the context outputs embeddings or not
+ // TODO: rename to avoid confusion with llama_get_embeddings()
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
// Set whether to use causal attention or not
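The TODO above exists because llama_set_embeddings() and llama_get_embeddings() sound symmetric but are not: the setter toggles whether the context produces embeddings, while the getter reads back the output of the last decode. A minimal sketch of typical usage (the helper name and error handling are assumptions for illustration, not part of this patch):

    // assumed helper for illustration only
    static const float * embed_once(llama_context * ctx, llama_batch batch) {
        llama_set_embeddings(ctx, true);     // ask the context to output embeddings
        if (llama_decode(ctx, batch) != 0) {
            return nullptr;                  // decode did not run this batch
        }
        return llama_get_embeddings(ctx);    // read the embeddings just computed
    }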
return server_task_type_need_logits(task_type);
}
+ // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+ // also we cannot split if the pooling would require any past tokens
+ bool can_split() const {
+ return
+ !need_embd() ||
+ (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
+ }
+
bool can_batch_with(server_slot & other_slot) const {
return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
}
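The comments on can_split() capture two constraints: if the context has no memory module, all embeddings must be computed within a single ubatch; and even with a memory, any pooling type other than LAST needs past tokens, so the prompt still cannot be split. Note also that the check is now driven by the slot's task type via need_embd() rather than the context-wide llama_get_embeddings() flag used by the removed version below. A minimal sketch of a context setup for which splitting stays possible (the helper and parameter values are illustrative assumptions, not part of this patch):

    // assumed helper for illustration only
    static llama_context * make_embd_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // slot tasks will need embeddings
        cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // pooling needs no past tokens
        // with e.g. LLAMA_POOLING_TYPE_MEAN, can_split() returns false and the whole
        // prompt must fit into a single ubatch (enforced by the n_ubatch check below)
        return llama_init_from_model(model, cparams);
    }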
llama_batch_free(batch);
}
- // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
- // also we cannot split if the pooling would require any past tokens
- bool can_split() const {
- return
- !llama_get_embeddings(ctx) ||
- (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
- }
-
bool load_model(const common_params & params) {
SRV_INF("loading model '%s'\n", params.model.path.c_str());
continue;
}
- if (!can_split()) {
+ if (!slot.can_split()) {
if (slot.n_prompt_tokens > n_ubatch) {
slot.release();
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
slot.n_prompt_tokens_processed = 0;
}
- if (!can_split()) {
+ if (!slot.can_split()) {
// cannot fit the prompt in the current batch - will try next iter
if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
continue;