// llama helpers
//
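+// host buffer helpers: with GGML_USE_CUBLAS the CUDA path (pinned host memory)
+// is taken only when ggml_cublas_loaded() reports that the backend actually
+// initialized at runtime; otherwise these fall back to plain malloc/free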
+inline void * llama_host_malloc(size_t n) {
#ifdef GGML_USE_CUBLAS
-# define llama_host_malloc(n) ggml_cuda_host_malloc(n)
-# define llama_host_free(data) ggml_cuda_host_free(data)
+ if (ggml_cublas_loaded()) {
+ return ggml_cuda_host_malloc(n);
+ } else {
+ return malloc(n);
+ }
#elif GGML_USE_METAL
-# define llama_host_malloc(n) ggml_metal_host_malloc(n)
-# define llama_host_free(data) ggml_metal_host_free(data)
+ return ggml_metal_host_malloc(n);
#elif GGML_USE_CPU_HBM
-# define llama_host_malloc(n) hbw_malloc(n)
-# define llama_host_free(data) if (data != NULL) hbw_free(data)
+ return hbw_malloc(n);
#else
-# define llama_host_malloc(n) malloc(n)
-# define llama_host_free(data) free(data)
+ return malloc(n);
#endif
+}
+
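+// counterpart to llama_host_malloc(): memory allocated while the CUDA backend
+// was loaded is pinned and must be released through ggml_cuda_host_free()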
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+ if (ggml_cublas_loaded()) {
+ return ggml_cuda_host_free(ptr);
+ } else {
+ return free(ptr);
+ }
+#elif GGML_USE_METAL
+ return ggml_metal_host_free(ptr);
+#elif GGML_USE_CPU_HBM
+ return hbw_free(ptr);
+#else
+ return free(ptr);
+#endif
+}
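+
+// minimal usage sketch (illustrative only): allocations from llama_host_malloc()
+// must be released with the matching llama_host_free(), whichever backend ends
+// up providing the memory, e.g.
+//
+//     void * buf = llama_host_malloc(n_bytes);
+//     if (buf != NULL) {
+//         // ... use the buffer ...
+//         llama_host_free(buf);
+//     }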
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
}
#ifdef GGML_USE_CUBLAS
- ggml_cuda_free_data(k);
- ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
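+        // k and v carry CUDA-side data only if the backend actually
+        // initialized, so the GPU free is skipped otherwise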
+ if (ggml_cublas_loaded()) {
+ ggml_cuda_free_data(k);
+ ggml_cuda_free_data(v);
+ }
+#endif
}
};
}
#ifdef GGML_USE_CUBLAS
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cuda_free_data(tensors_by_name[i].second);
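+    // free the per-tensor VRAM data and the CUDA scratch buffer only when the
+    // backend was actually initialized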
+ if (ggml_cublas_loaded()) {
+ for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+ ggml_cuda_free_data(tensors_by_name[i].second);
+ }
+ ggml_cuda_free_scratch();
}
- ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+#endif
+
+#if defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second);
}
ggml_set_name(cache.v, "cache_v");
(void) n_gpu_layers;
+
#ifdef GGML_USE_CUBLAS
- size_t vram_kv_cache = 0;
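+    // the KV cache is offloaded only when the CUDA backend is usable and more
+    // GPU layers than n_layer + 1 (v cache) / n_layer + 2 (k cache) are requested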
+ if (ggml_cublas_loaded()) {
+ size_t vram_kv_cache = 0;
- if (n_gpu_layers > (int)n_layer + 1) {
- ggml_cuda_assign_buffers_no_scratch(cache.v);
- LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.v);
- }
- if (n_gpu_layers > (int)n_layer + 2) {
- ggml_cuda_assign_buffers_no_scratch(cache.k);
- LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
- vram_kv_cache += ggml_nbytes(cache.k);
- }
- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ if (n_gpu_layers > (int)n_layer + 1) {
+ ggml_cuda_assign_buffers_no_scratch(cache.v);
+ LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.v);
+ }
+ if (n_gpu_layers > (int)n_layer + 2) {
+ ggml_cuda_assign_buffers_no_scratch(cache.k);
+ LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+ vram_kv_cache += ggml_nbytes(cache.k);
+ }
+ if (vram_kv_cache > 0) {
+ LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ }
}
-#endif // GGML_USE_CUBLAS
+#endif
return true;
}
}
(void) main_gpu;
+
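+    // default to CPU; switched to the GPU backends below only when a usable
+    // backend is available at runtime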
+ enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+ enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
#ifdef GGML_USE_CUBLAS
- LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
- ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
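+    // pick the main device and enable offload only if the CUDA backend
+    // initialized successfully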
+ if (ggml_cublas_loaded()) {
+ LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+ ggml_cuda_set_main_device(main_gpu);
+
+ llama_backend_offload = GGML_BACKEND_GPU;
+ llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+ }
#elif defined(GGML_USE_CLBLAST)
- LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+ LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+ llama_backend_offload = GGML_BACKEND_GPU;
+ llama_backend_offload_split = GGML_BACKEND_GPU;
#endif
// prepare memory for the weights
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
- backend_norm = LLAMA_BACKEND_OFFLOAD;
+ backend_norm = llama_backend_offload;
#else
- backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+ backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
- backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+ backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+ const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+ const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
auto & layer = model.layers[i];