llama : do not allocate KV cache for "vocab_only == true" (#682)
author     Stephan Walter <redacted>
           Sun, 2 Apr 2023 07:18:53 +0000 (07:18 +0000)
committer  GitHub <redacted>
           Sun, 2 Apr 2023 07:18:53 +0000 (10:18 +0300)
Fixes sanitizer CI

llama.cpp

index bed24207db7760febcfe3aa2d8e5642279521ee8..1b3157cd2e5e4078a743c89e38162290894127a2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1608,7 +1608,7 @@ struct llama_context * llama_init_from_file(
     }
 
     // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
         if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
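
With this change, a context created with `vocab_only = true` skips KV cache allocation entirely. Below is a minimal sketch (not part of the commit) of how such a context might be created, e.g. to inspect the vocabulary without reserving inference buffers. It assumes the llama.h C API of this vintage (`llama_context_default_params`, `llama_init_from_file`, `llama_n_vocab`); treat the exact names and the program itself as illustrative.

    // vocab_only_demo.cpp - hypothetical usage sketch
    #include <cstdio>
    #include "llama.h"

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.bin>\n", argv[0]);
            return 1;
        }

        llama_context_params params = llama_context_default_params();
        params.vocab_only = true; // load only the vocabulary; with this commit, no KV cache is reserved

        llama_context * ctx = llama_init_from_file(argv[1], params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        printf("vocab size: %d\n", llama_n_vocab(ctx));

        llama_free(ctx);
        return 0;
    }

Because `kv_cache_init()` is now skipped in this mode, the vocab-only path no longer touches uninitialized cache memory, which is what tripped the sanitizer CI.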