(void) dst;
}
+// Dispatch target for GGML_OP_DUP and GGML_OP_CONT: a thin adapter that hands
+// the work to ggml_cuda_cpy.
+//
+//   src0 - forwarded as the first argument of ggml_cuda_cpy
+//   src1 - ignored by this wrapper
+//   dst  - forwarded as the second argument of ggml_cuda_cpy
+//
+// NOTE(review): presumably ggml_cuda_cpy copies its first argument into its
+// second and does not read the third (passed as nullptr here) - confirm
+// against its definition.
+void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // src1 exists only to match the common op-handler signature; mark it as
+    // intentionally unused so the compiler does not warn.
+    (void) src1;
+
+    ggml_cuda_cpy(src0, dst, nullptr);
+}
+
void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_diag_mask_inf, true, true);
// recursively assign CUDA buffers until a compute tensor is found
if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
+ if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace);
}
}
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
switch (tensor->op) {
+ case GGML_OP_DUP:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
case GGML_OP_ADD:
if (!any_on_device) {
return false;
}
func = ggml_cuda_cpy;
break;
+ case GGML_OP_CONT:
+ if (!any_on_device) {
+ return false;
+ }
+ func = ggml_cuda_dup;
+ break;
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
if (&g_state.contexts[i].context == ctx) {
g_state.contexts[i].used = false;
- GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
- __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
+ GGML_PRINT_DEBUG("%s: context %d has been freed. memory used = %zu\n",
+ __func__, i, ggml_used_mem(ctx));
if (ctx->mem_buffer_owned) {
GGML_ALIGNED_FREE(ctx->mem_buffer);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.nth = n_tasks_arr[node_n];
ggml_compute_forward(¶ms, node);
- ggml_graph_compute_perf_stats_node(node, state->shared);
}
+ ggml_graph_compute_perf_stats_node(node, state->shared);
}
// distribute new work or execute it direct if 1T
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_FINALIZE;
ggml_compute_forward(¶ms, node);
- ggml_graph_compute_perf_stats_node(node, state->shared);
}
+
+ ggml_graph_compute_perf_stats_node(node, state->shared);
} else {
break;
}
}
void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
- //assert(cgraph->work == NULL);
- //assert(cgraph->work_size == 0);
-
uint64_t size_eval = 0;
// compute size of intermediate results
GGML_PRINT("=== GRAPH ===\n");
- GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
- GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];