ctx_bufs.emplace_back(ctx, buf);
}
- // print memory requirements
- {
+ if (llama_supports_gpu_offload()) {
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
const int max_offloadable_layers = hparams.n_layer + 1;
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+ }
- for (ggml_backend_buffer_t buf : model.bufs) {
- LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
- }
+ // print memory requirements
+ for (ggml_backend_buffer_t buf : model.bufs) {
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}
// populate tensors_by_name