git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
llama : improve output buffer type selection (#10098)
author: Diego Devesa <redacted>
Thu, 31 Oct 2024 23:49:53 +0000 (00:49 +0100)
committer: GitHub <redacted>
Thu, 31 Oct 2024 23:49:53 +0000 (00:49 +0100)
src/llama.cpp

index ed3998a1fb18201611799d2b2fe254a8a8b64f17..ca0d259b21fb9802daa1e0cd255477c157782e57 100644 (file)
@@ -17162,18 +17162,10 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
 
         auto * buft = ggml_backend_cpu_buffer_type();
         // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-        ggml_tensor * output_tensor = lctx.model.output;
-        if (!output_tensor) {
-            // bert models don't have an output tensor, use the last layer
-            output_tensor = lctx.model.layers.back().layer_out_norm;
-        }
-        if (output_tensor) {
-            auto * output_buft = ggml_backend_buffer_get_type(output_tensor->buffer);
-            auto * output_dev = ggml_backend_buft_get_device(output_buft);
-            auto * output_dev_host_buft = ggml_backend_dev_host_buffer_type(output_dev);
-            if (output_dev_host_buft) {
-                buft = output_dev_host_buft;
-            }
+        auto * output_dev = lctx.model.dev_output.dev;
+        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
+        if (output_dev_host_buft) {
+            buft = output_dev_host_buft;
         }
         lctx.buf_output = ggml_backend_buft_alloc_buffer(buft, new_size);
         if (lctx.buf_output == nullptr) {