LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the context outputs embeddings or not
+ // TODO: rename to avoid confusion with llama_get_embeddings()
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
// Set whether to use causal attention or not
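The TODO above exists because llama_set_embeddings() and llama_get_embeddings() sound symmetric but are not: the setter toggles whether the context produces embeddings, while the getter reads back the output of the last decode. A minimal sketch of typical usage (the helper name and error handling are assumptions for illustration, not part of this patch):

    // assumed helper for illustration only
    static const float * embed_once(llama_context * ctx, llama_batch batch) {
        llama_set_embeddings(ctx, true);     // ask the context to output embeddings
        if (llama_decode(ctx, batch) != 0) {
            return nullptr;                  // decode did not run this batch
        }
        return llama_get_embeddings(ctx);    // read the embeddings just computed
    }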
return server_task_type_need_logits(task_type);
}
+ // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+ // also we cannot split if the pooling would require any past tokens
+ bool can_split() const {
+ return
+ !need_embd() ||
+ (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
+ }
+
bool can_batch_with(server_slot & other_slot) const {
return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
}
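The comments on can_split() capture two constraints: if the context has no memory module, all embeddings must be computed within a single ubatch; and even with a memory, any pooling type other than LAST needs past tokens, so the prompt still cannot be split. Note also that the check is now driven by the slot's task type via need_embd() rather than the context-wide llama_get_embeddings() flag used by the removed version below. A minimal sketch of a context setup for which splitting stays possible (the helper and parameter values are illustrative assumptions, not part of this patch):

    // assumed helper for illustration only
    static llama_context * make_embd_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // slot tasks will need embeddings
        cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // pooling needs no past tokens
        // with e.g. LLAMA_POOLING_TYPE_MEAN, can_split() returns false and the whole
        // prompt must fit into a single ubatch (enforced by the n_ubatch check below)
        return llama_init_from_model(model, cparams);
    }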
llama_batch_free(batch);
}
- // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
- // also we cannot split if the pooling would require any past tokens
- bool can_split() const {
- return
- !llama_get_embeddings(ctx) ||
- (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
- }
-
bool load_model(const common_params & params) {
SRV_INF("loading model '%s'\n", params.model.path.c_str());
continue;
}
- if (!can_split()) {
+ if (!slot.can_split()) {
if (slot.n_prompt_tokens > n_ubatch) {
slot.release();
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
slot.n_prompt_tokens_processed = 0;
}
- if (!can_split()) {
+ if (!slot.can_split()) {
// cannot fit the prompt in the current batch - will try next iter
if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
continue;