llama : do not print "offloading layers" message in CPU-only builds (#5416)

author slaren <redacted>

Thu, 8 Feb 2024 20:33:03 +0000 (21:33 +0100)

committer GitHub <redacted>

Thu, 8 Feb 2024 20:33:03 +0000 (21:33 +0100)
author slaren <redacted>
Thu, 8 Feb 2024 20:33:03 +0000 (21:33 +0100)
committer GitHub <redacted>
Thu, 8 Feb 2024 20:33:03 +0000 (21:33 +0100)
diff --git a/llama.cpp b/llama.cpp

index 89acafbc3ffd79f50917217c3dc21d16aeb8db06..db7d1c1cd18ee9cddb8b07d934e8112e61a83073 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4209,8 +4209,7 @@ static bool llm_load_tensors(
          ctx_bufs.emplace_back(ctx, buf);
      }
  
-    // print memory requirements
-    {
+    if (llama_supports_gpu_offload()) {
          const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  
          LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -4222,10 +4221,11 @@ static bool llm_load_tensors(
          const int max_offloadable_layers       = hparams.n_layer + 1;
  
          LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+    }
  
-        for (ggml_backend_buffer_t buf : model.bufs) {
-            LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
-        }
+    // print memory requirements
+    for (ggml_backend_buffer_t buf : model.bufs) {
+        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
      }
  
      // populate tensors_by_name
author	slaren <redacted>
	Thu, 8 Feb 2024 20:33:03 +0000 (21:33 +0100)
committer	GitHub <redacted>
	Thu, 8 Feb 2024 20:33:03 +0000 (21:33 +0100)