}
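+ // when flash_attn_type is AUTO this value is provisional - sched_reserve() makes the final decision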
cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
cparams.op_offload = params.op_offload;
cparams.kv_unified = params.kv_unified;
+ // initialized later
+ cparams.pipeline_parallel = false;
+
{
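+ // optional env override for graph reuse (a non-zero value disables it)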
const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
- const uint32_t n_seqs = cparams.n_seq_max;
- const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- const size_t max_nodes = this->graph_max_nodes(n_tokens);
-
- LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
-
- gf_res_prev.reset(new llm_graph_result(max_nodes));
- gf_res_reserve.reset(new llm_graph_result(max_nodes));
-
// TODO: move these checks to ggml_backend_sched
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
}
}
- sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));
+ cparams.pipeline_parallel = pipeline_parallel;
- if (pipeline_parallel) {
+ if (cparams.pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
}
- llama_memory_context_ptr mctx;
- if (memory) {
- LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
- mctx = memory->init_full();
- if (!mctx) {
- throw std::runtime_error("failed to initialize memory module");
+ sched_reserve();
+
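+ // a quantized V cache requires Flash Attention - validate only after sched_reserve() has resolved any auto setting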
+ if (!cparams.flash_attn) {
+ if (ggml_is_quantized(params.type_v)) {
+ throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
}
}
+ }
- cross.v_embd.clear();
-
- // avoid reserving graphs with zero outputs - assume one output per sequence
- n_outputs = n_seqs;
-
- LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+ // Initialize the full vocabulary token ids for backend samplers.
+ {
+ const int n_vocab = model.vocab.n_tokens();
- // resolve automatic Flash Attention use
- if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
- auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
- if (!gf) {
- throw std::runtime_error("failed to split graph for Flash Attention check");
- }
+ sampling.token_ids_full_vocab.resize(n_vocab);
+ for (int i = 0; i < n_vocab; ++i) {
+ sampling.token_ids_full_vocab[i] = i;
+ }
+ }
+}
- const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
- bool fa_device_mismatch = false;
- for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
- ggml_tensor * n = ggml_graph_node(gf, i);
- if (n->op != GGML_OP_FLASH_ATTN_EXT) {
- continue;
- }
- ggml_backend_dev_t device_fa = ggml_backend_get_device(
- ggml_backend_sched_get_tensor_backend(sched.get(), n));
+llama_context::~llama_context() {
+ if (!model.hparams.no_alloc) {
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
- // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
- GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
- const int il = std::stoi(n->name + prefix_len);
- ggml_backend_dev_t device_kv = model.dev_layer(il);
- if (device_fa != device_kv) {
- LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
- "is assigned to device %s (usually due to missing support)\n",
- __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
- // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
- fa_device_mismatch = true;
- break;
- }
- }
- if (fa_device_mismatch) {
- cparams.flash_attn = false;
- LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
- if (ggml_is_quantized(params.type_v)) {
- throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
- }
+ const size_t size_exp = backend_buf_exp_size[i];
+ const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ if (size_exp == size_act) {
+ LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
} else {
- cparams.flash_attn = true;
- LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
+ __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
}
}
+ }
+ ggml_opt_free(opt_ctx);
+}
- // reserve worst-case graph
- int n_splits_pp = -1;
- int n_nodes_pp = -1;
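+// lazily (re)creates the scheduler and reserves the worst-case compute graphs
+// called from the decode paths and whenever a state change sets sched_need_reserve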
+void llama_context::sched_reserve() {
+ if (!sched_need_reserve) {
+ return;
+ }
- int n_splits_tg = -1;
- int n_nodes_tg = -1;
+ sched_need_reserve = false;
- // reserve pp (prompt processing) graph first so that buffers are only allocated once
- {
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
- model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
- if (!gf) {
- if (pipeline_parallel) {
- LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
- sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
- gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
- }
- if (!gf) {
- throw std::runtime_error("failed to allocate compute pp buffers");
- }
- }
+ LLAMA_LOG_INFO("%s: reserving ...\n", __func__);
+
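+ // flush any in-flight work before the scheduler is recreated below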
+ synchronize();
+
+ const int64_t t_start_us = ggml_time_us();
+
+ const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
- n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
- n_nodes_pp = ggml_graph_n_nodes(gf);
+ const size_t max_nodes = this->graph_max_nodes(n_tokens);
+
+ LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
+
+ gf_res_prev.reset(new llm_graph_result(max_nodes));
+ gf_res_reserve.reset(new llm_graph_result(max_nodes));
+
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
+
+ llama_memory_context_ptr mctx;
+ if (memory) {
+ LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
+ mctx = memory->init_full();
+ if (!mctx) {
+ throw std::runtime_error("failed to initialize memory module");
}
+ }
- // reserve with tg (token generation) graph to get the number of splits and nodes
- {
- auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
- if (!gf) {
- throw std::runtime_error("failed to allocate compute tg buffers");
- }
+ // avoid reserving graphs with zero outputs - assume one output per sequence
+ const int n_outputs = n_seqs;
- n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
- n_nodes_tg = ggml_graph_n_nodes(gf);
+ LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+
+ // resolve automatic Flash Attention use
+ if (cparams.auto_fa) {
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
+ if (!gf) {
+ throw std::runtime_error("failed to split graph for Flash Attention check");
}
- // reserve again with pp graph to avoid ggml-alloc reallocations during inference
- {
- // TODO: not sure if the following graph would be worster case for multi-stream KV caches:
- //
- // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
- //
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
- if (!gf) {
- throw std::runtime_error("failed to allocate compute pp buffers");
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+ bool fa_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+ continue;
+ }
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(
+ ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_fa != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+ // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+ fa_device_mismatch = true;
+ break;
}
}
+ if (fa_device_mismatch) {
+ cparams.flash_attn = false;
+ LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+ } else {
+ cparams.flash_attn = true;
+ LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ }
- for (size_t i = 0; i < backend_ptrs.size(); ++i) {
- ggml_backend_t backend = backend_ptrs[i];
- ggml_backend_buffer_type_t buft = backend_buft[i];
- if (!model.hparams.no_alloc) {
- backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
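+ // resolve the auto setting only once - later reserves reuse the decided cparams.flash_attn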
+ cparams.auto_fa = false;
+ }
+
+ // reserve worst-case graph
+ int n_splits_pp = -1;
+ int n_nodes_pp = -1;
+
+ int n_splits_tg = -1;
+ int n_nodes_tg = -1;
+
+ // reserve pp (prompt processing) graph first so that buffers are only allocated once
+ {
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+ model.hparams.no_alloc, model.hparams.no_alloc ? backend_buf_exp_size.data() : nullptr);
+ if (!gf) {
+ if (cparams.pipeline_parallel) {
+ LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
+ cparams.pipeline_parallel = false;
+ sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, false, cparams.op_offload));
+ gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
}
- if (backend_buf_exp_size[i] > 1) {
- LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
- ggml_backend_buft_name(buft),
- backend_buf_exp_size[i] / 1024.0 / 1024.0);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
}
}
- if (n_nodes_pp == n_nodes_tg) {
- LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
- } else {
- LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
- }
+ n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
+ n_nodes_pp = ggml_graph_n_nodes(gf);
+ }
- if (n_splits_pp == n_splits_tg) {
- LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
- } else {
- LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
+ // reserve with tg (token generation) graph to get the number of splits and nodes
+ {
+ auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute tg buffers");
}
+
+ n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
+ n_nodes_tg = ggml_graph_n_nodes(gf);
}
- // Initialize the full vocabulary token ids for backend samplers.
+ // reserve again with pp graph to avoid ggml-alloc reallocations during inference
{
- const int n_vocab = model.vocab.n_tokens();
+ // TODO: not sure if the following graph would be the worst case for multi-stream KV caches:
+ //
+ // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
+ //
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
+ }
+ }
- sampling.token_ids_full_vocab.resize(n_vocab);
- for (int i = 0; i < n_vocab; ++i) {
- sampling.token_ids_full_vocab[i] = i;
+ for (size_t i = 0; i < backend_ptrs.size(); ++i) {
+ ggml_backend_t backend = backend_ptrs[i];
+ ggml_backend_buffer_type_t buft = backend_buft[i];
+ if (!model.hparams.no_alloc) {
+ backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend);
+ }
+ if (backend_buf_exp_size[i] > 1) {
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buft_name(buft),
+ backend_buf_exp_size[i] / 1024.0 / 1024.0);
}
}
-}
-llama_context::~llama_context() {
- if (!model.hparams.no_alloc) {
- for (size_t i = 0; i < backend_ptrs.size(); ++i) {
- ggml_backend_t backend = backend_ptrs[i];
- ggml_backend_buffer_type_t buft = backend_buft[i];
+ if (n_nodes_pp == n_nodes_tg) {
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
+ } else {
+ LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
+ }
- const size_t size_exp = backend_buf_exp_size[i];
- const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend);
- if (size_exp == size_act) {
- LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n",
- __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- } else {
- LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n",
- __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0));
- }
- }
+ if (n_splits_pp == n_splits_tg) {
+ LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
+ } else {
+ LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
}
- ggml_opt_free(opt_ctx);
+
+ const int64_t t_end_us = ggml_time_us();
+
+ LLAMA_LOG_INFO("%s: reserve took %.2f ms\n", __func__, (t_end_us - t_start_us)/1000.0);
}
void llama_context::synchronize() {
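+ // the scheduler is created lazily in sched_reserve() and may not exist yet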
+ if (!sched) {
+ return;
+ }
+
ggml_backend_sched_synchronize(sched.get());
// FIXME: if multiple single tokens are evaluated without a synchronization,
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
cparams.embeddings = value;
+
+ // TODO: not sure yet if we want to reserve here
+ //sched_need_reserve = true;
}
void llama_context::set_causal_attn(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+ if (cparams.causal_attn == value) {
+ return;
+ }
+
cparams.causal_attn = value;
+
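+ // the graph structure depends on causal_attn - mark the scheduler for re-reservation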
+ sched_need_reserve = true;
}
void llama_context::set_warmup(bool value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+ if (cparams.warmup == value) {
+ return;
+ }
+
cparams.warmup = value;
+
+ sched_need_reserve = true;
}
bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) {
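+ // removing a sampler that was never set is a no-op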
+ if (!sampler && sampling.samplers.count(seq_id) == 0) {
+ return true;
+ }
+
LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler);
const bool can_offload =
sampling.samplers[seq_id] = sampler;
+ sched_need_reserve = true;
+
return true;
}
if (sampler && !can_offload) {
LLAMA_LOG_WARN("%s: sampler '%s' for seq_id = %d, cannot be offloaded to the backend\n", __func__, llama_sampler_name(sampler), seq_id);
+ if (sampling.samplers.count(seq_id) > 0) {
+ sched_need_reserve = true;
+ }
+
sampling.samplers.erase(seq_id);
return false;
sampling.samplers.erase(seq_id);
+ sched_need_reserve = true;
+
return true;
}
float scale) {
LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
+ if (auto it = loras.find(adapter); it != loras.end()) {
+ if (it->second == scale) {
+ return;
+ }
+ }
+
loras[adapter] = scale;
+
+ sched_need_reserve = true;
}
bool llama_context::rm_adapter_lora(
llama_adapter_lora * adapter) {
LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
- auto pos = loras.find(adapter);
- if (pos != loras.end()) {
- loras.erase(pos);
+ auto it = loras.find(adapter);
+ if (it != loras.end()) {
+ loras.erase(it);
+
+ sched_need_reserve = true;
+
return true;
}
void llama_context::clear_adapter_lora() {
LLAMA_LOG_DEBUG("%s: call\n", __func__);
+ if (loras.empty()) {
+ return;
+ }
+
loras.clear();
+
+ sched_need_reserve = true;
}
bool llama_context::apply_adapter_cvec(
int32_t il_end) {
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
+ // TODO: should we reserve?
+
return cvec.apply(model, data, len, n_embd, il_start, il_end);
}
// TODO: this clear of the buffer can easily be forgotten - need something better
embd_seq.clear();
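+ // apply any pending scheduler (re)reservation before building the graph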
+ sched_reserve();
+
n_queued_tokens += n_tokens;
// reserve output buffer
auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
// extract logits
- if (logits && t_logits) {
+ if (logits && t_logits) {
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(logits != nullptr);
embd_seq.clear();
output_swaps.clear();
+ sched_reserve();
+
bool did_optimize = false;
// handle any pending shifts/copies
SERVER_STATE_READY, // Server is ready and model is loaded
};
-static bool server_task_type_need_embd(server_task_type task_type) {
- switch (task_type) {
- case SERVER_TASK_TYPE_EMBEDDING:
- case SERVER_TASK_TYPE_RERANK:
- return true;
- default:
- return false;
- }
-}
-
-static bool server_task_type_need_logits(server_task_type task_type) {
- switch (task_type) {
- case SERVER_TASK_TYPE_COMPLETION:
- case SERVER_TASK_TYPE_INFILL:
- return true;
- default:
- return false;
- }
-}
-
struct server_slot {
int id;
return res;
}
+ void prompt_clear(bool allow_processing) {
+ if (!allow_processing) {
+ GGML_ASSERT(!is_processing());
+ }
+
+ SLT_INF(*this, "clearing prompt with %zu tokens\n", prompt.tokens.size());
+
+ llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
+ prompt.tokens.clear();
+ }
+
std::vector<common_adapter_lora_info> lora;
int32_t alora_invocation_start = -1;
n_draft_total = 0;
n_draft_accepted = 0;
+ task_prev = std::move(task);
task.reset();
- task_prev.reset();
+
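+ // detach any backend sampler that was attached for this sequence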
+ llama_set_sampler(ctx, id, nullptr);
// clear alora start
alora_invocation_start = -1;
}
- // remove cached prompt + tokens
- void clear(bool allow_processing) {
- if (!allow_processing) {
- GGML_ASSERT(!is_processing());
- }
-
- SLT_INF(*this, "clearing slot with %zu tokens\n", prompt.tokens.size());
+ void init_sampler() const {
+ common_sampler_reset(smpl.get());
- llama_memory_seq_rm(llama_get_memory(ctx), id, -1, -1);
- prompt.tokens.clear();
- }
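+ // nothing else to do for tasks that never sample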
+ if (!task->need_sampling()) {
+ return;
+ }
- void init_sampler() const {
const int64_t t_start = ggml_time_us();
- common_sampler_reset(smpl.get());
-
int n_text = 0;
for (int i = 0; i < (int) prompt.tokens.size(); i++) {
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
}
- // TODO: move to server_task
- bool need_embd() const {
- GGML_ASSERT(task);
-
- return server_task_type_need_embd(task->type);
- }
-
- // TODO: move to server_task
- bool need_logits() const {
- GGML_ASSERT(task);
-
- return server_task_type_need_logits(task->type);
- }
-
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
// also we cannot split if the pooling would require any past tokens
bool can_split() const {
+ GGML_ASSERT(task);
+
return
- !need_embd() ||
+ !task->need_embd() ||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
}
// do not keep context of the child slots - the parent's context is enough
if (is_child()) {
- clear(false);
+ prompt_clear(false);
}
- task_prev = std::move(task);
- task.reset();
+ reset();
callback_on_release(id);
}
slots.clear();
+ // initialize slots
for (int i = 0; i < params_base.n_parallel; i++) {
server_slot slot;
ret->prompt_save(*prompt_cache);
if (!ret->prompt_load(*prompt_cache, task.tokens)) {
- ret->clear(false);
+ ret->prompt_clear(false);
}
prompt_cache->update();
if (slot.prompt.n_tokens() > 0) {
SRV_WRN("purging slot %d with %zu tokens\n", slot.id, slot.prompt.tokens.size());
- slot.clear(false);
+ slot.prompt_clear(false);
res = true;
}
bool launch_slot_with_task(server_slot & slot, server_task && task) {
- slot.reset();
-
// process per-request lora adapters
if (!task.params.lora.empty()) {
auto task_loras = construct_lora_list(task.params.lora);
SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
// initialize samplers
- {
+ if (task.need_sampling()) {
slot.smpl.reset(common_sampler_init(model, task.params.sampling));
if (slot.smpl == nullptr) {
}
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
+ } else {
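+ // drop any sampler left over from a previous task on this slot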
+ slot.smpl.reset();
}
// initialize draft batch
// Erase token cache
const size_t n_erased = slot->prompt.tokens.size();
- slot->clear(false);
+ slot->prompt_clear(false);
auto res = std::make_unique<server_task_result_slot_erase>();
res->id = task.id;
}
// TODO: support memory-less logits computation
- if (slot.need_logits() && !llama_get_memory(ctx)) {
+ if (slot.task->need_logits() && !llama_get_memory(ctx)) {
send_error(slot, "the current context does not support logits computation, skipping", ERROR_TYPE_SERVER);
slot.release();
continue;
if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
- slot.clear(true);
+ slot.prompt_clear(true);
// there is no common part left
slot.n_prompt_tokens_cache = 0;
cur_tok,
slot.prompt.tokens.pos_next(),
{ slot.id },
- slot.need_embd());
+ slot.task->need_embd());
slot.prompt.tokens.push_back(cur_tok);
slot.n_prompt_tokens_processed++;
slot_batched->lora[alora_disabled_id].scale = alora_scale;
}
- llama_set_embeddings(ctx, slot_batched->need_embd());
+ llama_set_embeddings(ctx, slot_batched->task->need_embd());
}
if (batch.n_tokens == 0) {
// note: it's complicated to keep track of how much of the current batch has been
// processed before the error occurred, so we simply clear the entire context
- slot.clear(false);
+ slot.prompt_clear(false);
}
}
continue; // continue loop of slots
}
+ GGML_ASSERT(slot.task->need_sampling());
+
// prompt evaluated for next-token prediction
slot.state = SLOT_STATE_GENERATING;
} else if (slot.state != SLOT_STATE_GENERATING) {