From: slaren
Date: Mon, 12 Feb 2024 17:07:14 +0000 (+0100)
Subject: ggml-alloc : allocate all leafs as if they were inputs (#731)
X-Git-Tag: upstream/0.0.1642~961
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=849215767b689727818888158332ae9172e719f9;p=pkg%2Fggml%2Fsources%2Fggml

ggml-alloc : allocate all leafs as if they were inputs (#731)

* ggml-alloc : allocate all leafs as if they were inputs

* ensure static leafs are allocated

* gpt-2-backend : remove unnecessary ggml_new_tensor

* update other gpt-2 examples to remove ggml_new_tensor calls in the graph
---

diff --git a/examples/gpt-2/main-alloc.cpp b/examples/gpt-2/main-alloc.cpp
index b0ddb52a..7a3197e6 100644
--- a/examples/gpt-2/main-alloc.cpp
+++ b/examples/gpt-2/main-alloc.cpp
@@ -407,11 +407,11 @@ struct ggml_cgraph * gpt2_graph(
         /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);

-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph  * gf = ggml_new_graph(ctx);

-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
     // at this point, the tensor data is not allocated yet and cannot be set
     // we will find the tensor after the graph is allocated by its name, and set the data then
     ggml_set_name(embd, "embd");
@@ -419,15 +419,15 @@ struct ggml_cgraph * gpt2_graph(
     // this is important to ensure that the input tensors are not overwritten before they are used
     ggml_set_input(embd);

-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
     ggml_set_name(position, "position");
     ggml_set_input(position);

     // wte + wpe
     struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
+        ggml_add(ctx,
+                ggml_get_rows(ctx, model.wte, embd),
+                ggml_get_rows(ctx, model.wpe, position));

     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * cur;
@@ -435,15 +435,15 @@ struct ggml_cgraph * gpt2_graph(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
+            cur = ggml_norm(ctx, inpL, hparams.eps);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
+            cur = ggml_add(ctx,
+                    ggml_mul(ctx,
+                        ggml_repeat(ctx, model.layers[il].ln_1_g, cur),
                         cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+                    ggml_repeat(ctx, model.layers[il].ln_1_b, cur));
         }

         // attn
@@ -455,45 +455,43 @@ struct ggml_cgraph * gpt2_graph(
         // cur = attn_w*cur + attn_b
         // [2304, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_attn_w,
                     cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_attn_attn_b, cur),
                     cur);
         }

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);

             // store key and value to memory
             if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));

-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
             }

             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             // [64, N, 12]
             struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                ggml_permute(ctx,
+                        ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             // [64, n_past + N, 12]
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                ggml_permute(ctx,
+                        ggml_reshape_3d(ctx,
+                            ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
@@ -511,47 +509,45 @@ struct ggml_cgraph * gpt2_graph(

             // K * Q
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // [n_past + N, N, 12]
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale(ctx,
                         KQ,
                         1.0f/sqrtf(float(n_embd)/n_head));

             // KQ_masked = mask_past(KQ_scaled)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);

             // KQ = soft_max(KQ_masked)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             // [n_past + N, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                ggml_cont_3d(ctx,
+                        ggml_permute(ctx,
+                            ggml_reshape_3d(ctx,
+                                ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+                        n_past + N, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
         }

         // projection
@@ -563,17 +559,17 @@ struct ggml_cgraph * gpt2_graph(
         // cur = proj_w*cur + proj_b
         // [768, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_proj_w,
                     cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_attn_proj_b, cur),
                     cur);
         }

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx, cur, inpL);

         struct ggml_tensor * inpFF = cur;
@@ -581,15 +577,15 @@ struct ggml_cgraph * gpt2_graph(
        {
            // norm
            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
+                cur = ggml_norm(ctx, inpFF, hparams.eps);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
+                cur = ggml_add(ctx,
+                        ggml_mul(ctx,
+                            ggml_repeat(ctx, model.layers[il].ln_2_g, cur),
                             cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
+                        ggml_repeat(ctx, model.layers[il].ln_2_b, cur));
            }

            // fully connected
@@ -600,17 +596,17 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = fc_w*cur + fc_b
            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_fc_w,
                    cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_mlp_fc_b, cur),
                    cur);

            // GELU activation
            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_gelu(ctx, cur);

            // projection
            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
@@ -620,37 +616,37 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = proj_w*cur + proj_b
            // [768, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_proj_w,
                    cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_mlp_proj_b, cur),
                    cur);
        }

        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = ggml_add(ctx, cur, inpFF);
    }

    // norm
    {
        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
+        inpL = ggml_norm(ctx, inpL, hparams.eps);

        // inpL = ln_f_g*inpL + ln_f_b
        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
+        inpL = ggml_add(ctx,
+                ggml_mul(ctx,
+                    ggml_repeat(ctx, model.ln_f_g, inpL),
                     inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
+                ggml_repeat(ctx, model.ln_f_b, inpL));
    }

    // inpL = WTE * inpL
    // [ 768, 50257] - model.lm_head
    // [ 768, N] - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
    ggml_set_name(inpL, "logits");

    // setting a tensor as the output will ensure that it is not overwritten by subsequent operations
    ggml_set_output(inpL);
@@ -660,7 +656,7 @@ struct ggml_cgraph * gpt2_graph(

    ggml_build_forward_expand(gf, inpL);

-    ggml_free(ctx0);
+    ggml_free(ctx);

    return gf;
}
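Note on the recurring change above: the `ggml_cpy` into an explicitly created destination tensor is replaced by `ggml_cont_2d`/`ggml_cont_3d`, which create the contiguous destination internally. This matters for this commit because an explicit `ggml_new_tensor_*` result is a leaf with no op, and ggml-alloc now allocates all such leafs up front as if they were inputs, so keeping them in the graph would reserve memory for the whole lifetime of the graph. A minimal sketch of the equivalence, assuming a hypothetical non-contiguous tensor `t` whose type already matches the destination (as it does in the hunks above, where `Qcur` is F32 and the V cache is copied into its own type):

    // old form: copy t into an explicitly created contiguous tensor;
    // the destination is a leaf (op == GGML_OP_NONE) that ggml-alloc now treats like an input
    struct ggml_tensor * dst     = ggml_new_tensor_3d(ctx, t->type, ne0, ne1, ne2);
    struct ggml_tensor * old_way = ggml_cpy(ctx, t, dst);

    // new form: ggml_cont_3d creates the destination internally and reshapes to (ne0, ne1, ne2);
    // no separate ggml_new_tensor call appears in the graph
    struct ggml_tensor * new_way = ggml_cont_3d(ctx, t, ne0, ne1, ne2);

`ggml_cont_3d` keeps the type of its source, so the rewrite is behavior-preserving in these examples.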
diff --git a/examples/gpt-2/main-backend.cpp b/examples/gpt-2/main-backend.cpp
index cfa618f7..bd4cb80b 100644
--- a/examples/gpt-2/main-backend.cpp
+++ b/examples/gpt-2/main-backend.cpp
@@ -538,9 +538,7 @@ struct ggml_cgraph * gpt2_graph(
             // [64, N, 12]
             struct ggml_tensor * Q =
                 ggml_permute(ctx,
-                        ggml_cpy(ctx,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                        ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
@@ -586,13 +584,13 @@ struct ggml_cgraph * gpt2_graph(
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             // [n_past + N, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx,
+                ggml_cont_3d(ctx,
                         ggml_permute(ctx,
                             ggml_reshape_3d(ctx,
                                 ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+                        n_past + N, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, N, 12]
@@ -604,9 +602,7 @@ struct ggml_cgraph * gpt2_graph(

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, N]
-            cur = ggml_cpy(ctx,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
         }

         // projection
diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp
index 51094467..6ad1838b 100644
--- a/examples/gpt-2/main-batched.cpp
+++ b/examples/gpt-2/main-batched.cpp
@@ -562,35 +562,35 @@ struct ggml_cgraph * gpt2_graph(
         /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);

-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);

     struct ggml_tensor * inpL;
     if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
         ggml_set_name(inp_tokens, "inp_tokens");
         ggml_set_input(inp_tokens);

-        struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
         ggml_set_name(position, "position");
         ggml_set_input(position);

         // wte + wpe
         inpL =
-            ggml_add(ctx0,
-                    ggml_get_rows(ctx0, model.wte, inp_tokens),
-                    ggml_get_rows(ctx0, model.wpe, position));
+            ggml_add(ctx,
+                    ggml_get_rows(ctx, model.wte, inp_tokens),
+                    ggml_get_rows(ctx, model.wpe, position));
     } else {
         GGML_ASSERT(batch.embd);

-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
         ggml_set_name(inpL, "embd");
         ggml_set_input(inpL);
     }

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_set_name(KQ_mask, "KQ_mask");
     ggml_set_input(KQ_mask);
@@ -601,12 +601,12 @@ struct ggml_cgraph * gpt2_graph(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
+            cur = ggml_norm(ctx, inpL, hparams.eps);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
+            cur = ggml_add(ctx,
+                    ggml_mul(ctx,
                         cur,
                         model.layers[il].ln_1_g),
                     model.layers[il].ln_1_b);
@@ -621,45 +621,45 @@ struct ggml_cgraph * gpt2_graph(
         // cur = attn_w*cur + attn_b
         // [2304, n_tokens]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_attn_w,
                     cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_attn_b);
         }

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);

             // store key and value to memory
             if (n_tokens >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
+                struct ggml_tensor * k = ggml_view_1d(ctx, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
+                struct ggml_tensor * v = ggml_view_1d(ctx, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));

-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
             }

             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             // [64, N, 12]
             struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+                ggml_permute(ctx,
+                        ggml_cont_3d(ctx,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens)),
+                            n_embd/n_head, n_head, n_tokens),
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_kv).permute(0, 2, 1, 3)
             // [64, n_kv, 12]
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
+                ggml_permute(ctx,
+                        ggml_reshape_3d(ctx,
+                            ggml_view_1d(ctx, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
                             n_embd/n_head, n_head, n_kv),
                         0, 2, 1, 3);
@@ -677,47 +677,45 @@ struct ggml_cgraph * gpt2_graph(

             // K * Q
             // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // [n_kv, n_tokens, 12]
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale(ctx,
                         KQ,
                         1.0f/sqrtf(float(n_embd)/n_head));

             // KQ_masked = mask_past(KQ_scaled)
             // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            struct ggml_tensor * KQ_masked = ggml_add(ctx, KQ_scaled, KQ_mask);

             // KQ = soft_max(KQ_masked)
             // [n_kv, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_kv).permute(1, 2, 0, 3).contiguous()
             // [n_kv, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
+                ggml_cont_3d(ctx,
+                        ggml_permute(ctx,
+                            ggml_reshape_3d(ctx,
+                                ggml_view_1d(ctx, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
                                 n_embd/n_head, n_head, n_kv),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.kv_cache.v->type, n_kv, n_embd/n_head, n_head));
+                        n_kv, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, n_tokens, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             // [64, 12, n_tokens]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, n_tokens]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, n_tokens);
         }

         // projection
@@ -729,17 +727,17 @@ struct ggml_cgraph * gpt2_graph(
         // cur = proj_w*cur + proj_b
         // [768, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_proj_w,
                     cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_proj_b);
         }

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx, cur, inpL);

         struct ggml_tensor * inpFF = cur;
@@ -747,12 +745,12 @@ struct ggml_cgraph * gpt2_graph(
        {
            // norm
            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
+                cur = ggml_norm(ctx, inpFF, hparams.eps);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
+                cur = ggml_add(ctx,
+                        ggml_mul(ctx,
                             cur,
                             model.layers[il].ln_2_g),
                         model.layers[il].ln_2_b);
@@ -766,17 +764,17 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = fc_w*cur + fc_b
            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_fc_w,
                    cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_fc_b);

            // GELU activation
            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_gelu(ctx, cur);

            // projection
            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
@@ -786,28 +784,28 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = proj_w*cur + proj_b
            // [768, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_proj_w,
                    cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_proj_b);
        }

        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = ggml_add(ctx, cur, inpFF);
    }

    // norm
    {
        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
+        inpL = ggml_norm(ctx, inpL, hparams.eps);

        // inpL = ln_f_g*inpL + ln_f_b
        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
+        inpL = ggml_add(ctx,
+                ggml_mul(ctx,
                    inpL,
                    model.ln_f_g),
                model.ln_f_b);
@@ -816,14 +814,14 @@ struct ggml_cgraph * gpt2_graph(
    // inpL = WTE * inpL
    // [ 768, 50257] - model.lm_head
    // [ 768, N] - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    inpL = ggml_mul_mat(ctx, model.lm_head, inpL);

    // logits -> probs
    //inpL = ggml_soft_max(ctx0, inpL);

    ggml_build_forward_expand(gf, inpL);

-    ggml_free(ctx0);
+    ggml_free(ctx);

    return gf;
}
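Since the graphs above are built in a no_alloc context, tensors such as `inp_tokens`, `position`, and `KQ_mask` have no data at graph-build time; they are named with `ggml_set_name` and flagged with `ggml_set_input` so the allocator places them first and never reuses their memory for intermediate results. A sketch of the caller side, assuming the graph is allocated with a `ggml_gallocr` as in these examples (`tokens`, `n_tokens`, and `galloc` are hypothetical caller state):

    struct ggml_cgraph * gf = gpt2_graph(/* ... */);  // build the graph; no tensor data exists yet
    ggml_gallocr_alloc_graph(galloc, gf);             // assign addresses to all graph tensors

    // locate the input tensors by the names set during graph construction, then fill them
    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
    ggml_backend_tensor_set(inp_tokens, tokens, 0, n_tokens*ggml_element_size(inp_tokens));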
diff --git a/examples/gpt-2/main-sched.cpp b/examples/gpt-2/main-sched.cpp
index b5b8af61..e753d5fb 100644
--- a/examples/gpt-2/main-sched.cpp
+++ b/examples/gpt-2/main-sched.cpp
@@ -557,17 +557,17 @@ struct ggml_cgraph * gpt2_graph(
         /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);

-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);

-    struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
+    struct ggml_tensor * embd = ggml_view_1d(ctx, model.embd, N, 0);
     // set inputs
     // TODO: move to gpt2_eval
     ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd));

-    struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
+    struct ggml_tensor * position = ggml_view_1d(ctx, model.position, N, 0);
     for (int i = 0; i < N; ++i) {
         int32_t v = n_past + i;
         ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v));
     }
@@ -577,9 +577,9 @@ struct ggml_cgraph * gpt2_graph(

     // wte + wpe
     struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
+        ggml_add(ctx,
+                ggml_get_rows(ctx, model.wte, embd),
+                ggml_get_rows(ctx, model.wpe, position));
     ggml_set_name(inpL, "inpL");
     ggml_set_name(inpL->src[0], "wte");
     ggml_set_name(inpL->src[1], "wpe");
@@ -590,13 +590,13 @@ struct ggml_cgraph * gpt2_graph(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
+            cur = ggml_norm(ctx, inpL, hparams.eps);
             ggml_format_name(cur, "l%d.norm", il);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
+            cur = ggml_add(ctx,
+                    ggml_mul(ctx,
                         cur,
                         model.layers[il].ln_1_g),
                     model.layers[il].ln_1_b);
@@ -613,12 +613,12 @@ struct ggml_cgraph * gpt2_graph(
         // cur = attn_w*cur + attn_b
         // [2304, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_attn_w,
                     cur);
             ggml_format_name(cur, "l%d.attn_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_attn_b);
             ggml_format_name(cur, "l%d.attn_b", il);
@@ -626,9 +626,9 @@ struct ggml_cgraph * gpt2_graph(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);

             ggml_format_name(Qcur, "l%d.Qcur", il);
             ggml_format_name(Kcur, "l%d.Kcur", il);
@@ -636,29 +636,27 @@ struct ggml_cgraph * gpt2_graph(

             // store key and value to memory
             if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));

-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
             }

             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             // [64, N, 12]
             struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                ggml_permute(ctx,
+                        ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
                         0, 2, 1, 3);
             ggml_format_name(Q, "l%d.Q", il);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             // [64, n_past + N, 12]
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                ggml_permute(ctx,
+                        ggml_reshape_3d(ctx,
+                            ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
             ggml_format_name(K, "l%d.K", il);
@@ -677,51 +675,48 @@ struct ggml_cgraph * gpt2_graph(

             // K * Q
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
             ggml_format_name(KQ, "l%d.KQ", il);

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx, KQ, KQ_scale);
             ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);

             // KQ_masked = mask_past(KQ_scaled)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
             ggml_format_name(KQ_masked, "l%d.KQ_masked", il);

             // KQ = soft_max(KQ_masked)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
             ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             // [n_past + N, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                ggml_cont_3d(ctx,
+                        ggml_permute(ctx,
+                            ggml_reshape_3d(ctx,
+                                ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-            ggml_format_name(V_trans, "l%d.V_trans", il);
+                        n_past + N, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
             ggml_format_name(KQV, "l%d.KQV", il);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
             ggml_format_name(KQV_merged, "l%d.KQV_merged", il);

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
             ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
         }
@@ -734,19 +729,19 @@ struct ggml_cgraph * gpt2_graph(
         // cur = proj_w*cur + proj_b
         // [768, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_proj_w,
                     cur);
             ggml_format_name(cur, "l%d.attn_proj_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_proj_b);
             ggml_format_name(cur, "l%d.attn_proj_b", il);
         }

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx, cur, inpL);
         ggml_format_name(cur, "l%d.add", il);

         struct ggml_tensor * inpFF = cur;
@@ -755,13 +750,13 @@ struct ggml_cgraph * gpt2_graph(
        {
            // norm
            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
+                cur = ggml_norm(ctx, inpFF, hparams.eps);
                 ggml_format_name(cur, "l%d.FFnorm", il);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
+                cur = ggml_add(ctx,
+                        ggml_mul(ctx,
                             cur,
                             model.layers[il].ln_2_g),
                         model.layers[il].ln_2_b);
@@ -777,19 +772,19 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = fc_w*cur + fc_b
            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_fc_w,
                    cur);
            ggml_format_name(cur, "l%d.mlp_fc_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_fc_b);
            ggml_format_name(cur, "l%d.mlp_fc_b", il);

            // GELU activation
            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_gelu(ctx, cur);
            ggml_format_name(cur, "l%d.gelu", il);

            // projection
@@ -800,32 +795,32 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = proj_w*cur + proj_b
            // [768, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_proj_w,
                    cur);
            ggml_format_name(cur, "l%d.mlp_proj_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_proj_b);
            ggml_format_name(cur, "l%d.mlp_proj_b", il);
        }

        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = ggml_add(ctx, cur, inpFF);
        ggml_format_name(inpL, "l%d.add2", il);
    }

    // norm
    {
        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
+        inpL = ggml_norm(ctx, inpL, hparams.eps);
        ggml_format_name(inpL, "out_norm");

        // inpL = ln_f_g*inpL + ln_f_b
        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
+        inpL = ggml_add(ctx,
+                ggml_mul(ctx,
                    inpL,
                    model.ln_f_g),
                model.ln_f_b);
@@ -836,7 +831,7 @@ struct ggml_cgraph * gpt2_graph(
    // inpL = WTE * inpL
    // [ 768, 50257] - model.lm_head
    // [ 768, N] - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
    ggml_format_name(inpL, "out_lm_head");

    // logits -> probs
@@ -844,7 +839,7 @@ struct ggml_cgraph * gpt2_graph(

    ggml_build_forward_expand(gf, inpL);

-    ggml_free(ctx0);
+    ggml_free(ctx);

    return gf;
}
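A detail shared by all four examples that is easy to miss: each of the K and V caches is a single 1D tensor laid out as n_layer consecutive blocks of n_ctx slots of n_embd elements, and the `ggml_view_1d` offsets above are byte offsets into that layout. Spelled out for the write path (the same expression as in the hunks, just unpacked; `model`, `il`, `N`, and `n_past` as in the surrounding code):

    // layer il, starting at slot n_past, writing N slots of n_embd elements each:
    // element offset = (il*n_ctx + n_past)*n_embd, scaled to bytes by the element size
    const size_t offs = ggml_element_size(model.memory_k)*n_embd*(il*n_ctx + n_past);
    struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, offs);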
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
index c28c37c4..56d59669 100644
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -377,6 +377,9 @@ struct ggml_gallocr {

     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };

 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }

@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));

-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                break;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
     // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }

+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
                 break;
             }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
         }
-    }
+    }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }

     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }

     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);

     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }

+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -827,6 +850,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }

     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -835,9 +859,15 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 break;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }

     return true;
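The net effect of the ggml-alloc changes: any leaf (a tensor with op == GGML_OP_NONE) reachable from a node is now reserved up front together with the flagged inputs, and leafs that appear in graph->leafs without any consumer are still given an address in the first buffer, so applications can allocate static tensors through the graph allocator. A minimal end-to-end sketch of the allocation flow the updated examples rely on (a hypothetical standalone program, CPU backend buffer type assumed):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor data is allocated later by ggml_gallocr_alloc_graph()
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_cgraph  * gf  = ggml_new_graph(ctx);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_name(a, "a");
        ggml_set_name(b, "b");
        ggml_set_input(a); // inputs (and, after this commit, all leafs) are allocated first
        ggml_set_input(b); // so they are not overwritten by intermediate results

        struct ggml_tensor * out = ggml_add(ctx, a, b);
        ggml_set_output(out); // outputs are protected from being reused by later operations
        ggml_build_forward_expand(gf, out);

        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
        ggml_gallocr_alloc_graph(galloc, gf); // after this call, a and b have addresses

        const float a_data[4] = {1, 2, 3, 4};
        const float b_data[4] = {5, 6, 7, 8};
        ggml_backend_tensor_set(a, a_data, 0, sizeof(a_data));
        ggml_backend_tensor_set(b, b_data, 0, sizeof(b_data));

        // ... compute the graph with a backend and read back "out" ...

        ggml_gallocr_free(galloc);
        ggml_free(ctx);
        return 0;
    }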