//#define GGML_ALLOCATOR_DEBUG
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)
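/*
 * For context, a minimal sketch of what a GGML_LOG_* macro is assumed to
 * expand into (names here are illustrative, not the exact ggml internals):
 * instead of writing to stderr directly, each message is formatted and
 * handed to a user-replaceable callback, with the old fprintf(stderr, ...)
 * behavior as the default sink.
 */
#include <stdarg.h>
#include <stdio.h>

enum log_level { LOG_LEVEL_DEBUG, LOG_LEVEL_WARN, LOG_LEVEL_ERROR };

typedef void (*log_callback)(enum log_level level, const char * text, void * user_data);

static void log_callback_default(enum log_level level, const char * text, void * user_data) {
    (void) level; (void) user_data;
    fputs(text, stderr); // default matches the replaced fprintf(stderr, ...) calls
}

static log_callback g_log_cb   = log_callback_default;
static void *       g_log_data = NULL;

static void log_internal(enum log_level level, const char * fmt, ...) {
    char buf[1024];
    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);
    g_log_cb(level, buf, g_log_data);
}

#define LOG_DEBUG(...) log_internal(LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LOG_ERROR(...) log_internal(LOG_LEVEL_ERROR, __VA_ARGS__)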
size = GGML_PAD(size, talloc->alignment);
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
}
}
}
}
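/*
 * Illustrative sketch of the best-fit search that precedes the error above
 * (assumed shapes, not the exact ggml-alloc internals): max_avail tracks the
 * largest free block so the "largest block available" message can report it
 * when even that block cannot hold the padded request.
 */
#include <stddef.h>

struct free_block { size_t offset; size_t size; };

static int find_best_fit(const struct free_block * blocks, int n, size_t size, size_t * max_avail) {
    int    best_fit_block = -1;
    size_t best_fit_size  = (size_t) -1;
    *max_avail = 0;
    for (int i = 0; i < n; i++) {
        if (blocks[i].size > *max_avail) {
            *max_avail = blocks[i].size;
        }
        if (blocks[i].size >= size && blocks[i].size < best_fit_size) {
            best_fit_block = i;
            best_fit_size  = blocks[i].size;
        }
    }
    return best_fit_block; // -1 -> caller aborts with "not enough space"
}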
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
}
#endif
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
return true;
}
if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
return true;
}
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
return true;
}
}
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
return true;
}
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
return false;
}
} else {
#ifndef NDEBUG
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
return false;
}
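/*
 * Caller-side pattern implied by the two branches above (a sketch against
 * the public ggml-alloc API; error handling elided): try the fast path
 * first, then re-reserve explicitly when the graph topology has changed.
 */
#include "ggml.h"
#include "ggml-alloc.h"

static bool alloc_graph_checked(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
    // fast path: same topology, or a single buffer that can be reallocated automatically
    if (ggml_gallocr_alloc_graph(galloc, graph)) {
        return true;
    }
    // slow path: multi-buffer graphs must reserve explicitly, then retry once
    return ggml_gallocr_reserve(galloc, graph) &&
           ggml_gallocr_alloc_graph(galloc, graph);
}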
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
}
if (this_size > max_size) {
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
if (n_buffers == 0) {
#ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
#ifndef NDEBUG
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+ GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
#endif
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
void register_backend(ggml_backend_reg_t reg) {
#ifndef NDEBUG
- fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
+ GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back(reg);
void register_device(ggml_backend_dev_t device) {
#ifndef NDEBUG
- fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+ GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
devices.push_back(device);
}
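/*
 * Sketch of consuming what register_backend/register_device populate above,
 * assuming the public ggml-backend device enumeration API:
 */
#include "ggml-backend.h"
#include <stdio.h>

static void list_registered_devices(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s (%s)\n", i,
            ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }
}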
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
if (data == NULL) {
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
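/*
 * The general pattern the TENSOR_ALIGNMENT over-allocation enables (a sketch
 * of what a GGML_ALIGNED_MALLOC replacement could do, per the TODO above):
 * round the returned address up to the next alignment boundary, which is
 * guaranteed to stay inside the padded region.
 */
#include <stdint.h>

static void * align_up(void * p, size_t alignment) { // alignment must be a power of two
    uintptr_t addr = (uintptr_t) p;
    return (void *) ((addr + alignment - 1) & ~(uintptr_t) (alignment - 1));
}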
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+ GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}
}
#ifndef NDEBUG
- fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+ GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
#endif
for (int i = 0; i < graph->n_nodes; i++) {
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+ GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
sched->splits[cur_split].n_inputs);
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
+ GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
}
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
cur_split++;
}
struct ggml_tensor * node = graph->nodes[i];
continue;
}
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
continue;
}
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
}
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
}
}
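/*
 * Sketch of the kind of helper the fmt_size calls above are assumed to be:
 * formats a byte count into a short human-readable string. A static buffer
 * is acceptable here only because these debug prints are single-threaded.
 */
#include <stdio.h>

static const char * fmt_size(size_t size) {
    static char buf[32];
    if (size >= 1024*1024) {
        snprintf(buf, sizeof(buf), "%.1fM", size / (1024.0*1024.0));
    } else {
        snprintf(buf, sizeof(buf), "%.1fK", size / 1024.0);
    }
    return buf;
}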
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
+ GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
+ GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
return false;
}
}
struct ggml_context * ctx_unallocated = ggml_init(params);
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
- fprintf(stderr, "failed to allocate context for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
// allocate nodes
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
if (buffer == NULL) {
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
ggml_hash_set_free(&hash_set);
free(node_copies);
free(node_init);
/* .context = */ ctx,
};
-#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
- fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+ GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
#endif
-#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
- fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+ GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#endif
return backend;
return;
}
}
- GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+ GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
ggml_cuda_set_device(device);
CUDA_CHECK(cudaFree(ptr));
pool_size -= size;
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
- GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+ GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr;
}
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
#ifndef NDEBUG
- GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+ GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
#endif
return false;
}
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif
}
if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif
}
// Changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
}
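/*
 * All of the #ifndef NDEBUG hunks above share one shape (a sketch, not the
 * verbatim ggml-cuda logic): a single pass over the graph clears the capture
 * flag as soon as any node is incompatible, logging the reason once.
 */
static bool graph_is_capturable(const struct ggml_cgraph * graph) {
    for (int i = 0; i < graph->n_nodes; i++) {
        const struct ggml_tensor * node = graph->nodes[i];
        if (node->op == GGML_OP_MUL_MAT_ID) {
            return false; // op not supported by CUDA graph capture
        }
        if (node->ne[1] > 1) {
            return false; // batch size > 1 can change kernel grid sizes between runs
        }
        // further checks elided: split buffers, unsupported copy ops, ...
    }
    return true;
}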
if (!ptr) {
use_cuda_graph = false;
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
} else {
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG
- GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+ GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
} else {
graph_evaluated_or_captured = true; // CUDA graph has been captured
// clear the error
cudaGetLastError();
- GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+ GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return false;
}
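/*
 * The failure path above follows the usual best-effort pinning pattern
 * (sketch using the CUDA runtime API): registration failure is non-fatal,
 * so the sticky error is cleared and the caller falls back to pageable
 * host memory instead of aborting.
 */
#include <cuda_runtime.h>

static bool try_pin_host_buffer(void * ptr, size_t size) {
    cudaError_t err = cudaHostRegister(ptr, size,
        cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        (void) cudaGetLastError(); // clear the error; pinning is optional
        return false;              // continue with unpinned memory
    }
    return true;
}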