server : coherent log output for KV cache full (#6637)

author Pierrick Hymbert <redacted>

Fri, 12 Apr 2024 11:49:21 +0000 (13:49 +0200)

committer GitHub <redacted>

Fri, 12 Apr 2024 11:49:21 +0000 (14:49 +0300)
author Pierrick Hymbert <redacted>
Fri, 12 Apr 2024 11:49:21 +0000 (13:49 +0200)
committer GitHub <redacted>
Fri, 12 Apr 2024 11:49:21 +0000 (14:49 +0300)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 2e791190b740a05dd316fb1b74c6ca24a78263b6..b08a09a57bf3771af9f30d4dbe1d7884b8ad8f25 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1083,7 +1083,7 @@ struct server_context {
                  };
  
                  if (llama_decode(ctx, batch_view) != 0) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERROR("llama_decode() failed", {});
                      return;
                  }
              }
@@ -1281,7 +1281,11 @@ struct server_context {
      }
  
      void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
-        LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
+        LOG_ERROR("task error", {
+            {"id_multi", id_multi},
+            {"id_task", id_task},
+            {"error", error},
+        });
  
          server_task_result res;
          res.id       = id_task;
@@ -2186,7 +2190,11 @@ struct server_context {
              if (ret != 0) {
                  if (n_batch == 1 || ret < 0) {
                      // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
+                        {"i",   i},
+                        {"n_batch",  n_batch}, // batch size in effect when decoding failed
+                        {"ret",   ret},
+                    });
                      for (auto & slot : slots) {
                          slot.state = SLOT_STATE_PROCESSING;
                          slot.command = SLOT_COMMAND_NONE;
@@ -2196,12 +2204,16 @@ struct server_context {
                      break; // break loop of n_batch
                  }
  
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
                  // retry with half the batch size to try to find a free slot in the KV cache
                  n_batch /= 2;
                  i -= n_batch;
  
+                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
+                    {"i",   i},
+                    {"n_batch",  n_batch},
+                    {"ret",   ret},
+                });
+
                  continue; // continue loop of n_batch
              }
author	Pierrick Hymbert <redacted>
	Fri, 12 Apr 2024 11:49:21 +0000 (13:49 +0200)
committer	GitHub <redacted>
	Fri, 12 Apr 2024 11:49:21 +0000 (14:49 +0300)