}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+ const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+ const struct llama_model * model = llama_get_model(ctx);
+
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
- if (llama_decode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to decode\n", __func__);
+ if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+ // encoder-only model
+ if (llama_encode(ctx, batch) < 0) {
+ fprintf(stderr, "%s : failed to encode\n", __func__);
+ }
+ } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+ // decoder-only model
+ if (llama_decode(ctx, batch) < 0) {
+ fprintf(stderr, "%s : failed to decode\n", __func__);
+ }
}
    for (int i = 0; i < batch.n_tokens; i++) {
        if (!batch.logits[i]) {
            continue;
        }
- // try to get sequence embeddings - supported only when pooling_type is not NONE
- const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
- GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
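+ // embd_pos selects the destination row in the output buffer: the token index when pooling is disabled, the sequence id otherwise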
+ const float * embd = nullptr;
+ int embd_pos = 0;
+
+ if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+ // try to get token embeddings
+ embd = llama_get_embeddings_ith(ctx, i);
+ embd_pos = i;
+ GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+ } else {
+ // try to get sequence embeddings - supported only when pooling_type is not NONE
+ embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+ embd_pos = batch.seq_id[i][0];
+ GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+ }
- float * out = output + batch.seq_id[i][0] * n_embd;
+ float * out = output + embd_pos * n_embd;
llama_embd_normalize(embd, out, n_embd, embd_norm);
}
}
const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
- if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
- fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+
+ if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+ fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
return 1;
}
const int n_prompts = prompts.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+ // count number of embeddings
+ int n_embd_count = 0;
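+ // with pooling disabled every token yields its own embedding; otherwise there is one embedding per prompt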
+ if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+ for (int k = 0; k < n_prompts; k++) {
+ n_embd_count += inputs[k].size();
+ }
+ } else {
+ n_embd_count = n_prompts;
+ }
+
// allocate output
const int n_embd = llama_n_embd(model);
- std::vector<float> embeddings(n_prompts * n_embd, 0);
+ std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data();
// break into batches
- int p = 0; // number of prompts processed already
+ int e = 0; // number of embeddings already stored
int s = 0; // number of prompts in current batch
for (int k = 0; k < n_prompts; k++) {
// clamp to n_batch tokens
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
- float * out = emb + p * n_embd;
+ float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
- llama_batch_clear(batch);
- p += s;
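+ // advance by one embedding per token (no pooling) or one per sequence (pooled)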
+ e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
+ llama_batch_clear(batch);
}
// add to batch
}
// final batch
- float * out = emb + p * n_embd;
+ float * out = emb + e * n_embd;
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
- // print the first part of the embeddings or for a single prompt, the full embedding
fprintf(stdout, "\n");
- for (int j = 0; j < n_prompts; j++) {
- fprintf(stdout, "embedding %d: ", j);
- for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
- if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
- } else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+
+ if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
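+ // token embeddings can be numerous, so print only the first and last few values of each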
+ for (int j = 0; j < n_embd_count; j++) {
+ fprintf(stdout, "embedding %d: ", j);
+ for (int i = 0; i < std::min(3, n_embd); i++) {
+ if (params.embd_normalize == 0) {
+ fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ } else {
+ fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ }
+ }
+ fprintf(stdout, " ... ");
+ for (int i = n_embd - 3; i < n_embd; i++) {
+ if (params.embd_normalize == 0) {
+ fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ } else {
+ fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ }
}
+ fprintf(stdout, "\n");
}
- fprintf(stdout, "\n");
- }
-
- // print cosine similarity matrix
- if (n_prompts > 1) {
- fprintf(stdout, "\n");
- printf("cosine similarity matrix:\n\n");
- for (int i = 0; i < n_prompts; i++) {
- fprintf(stdout, "%6.6s ", prompts[i].c_str());
+ } else {
+ // print the first part of the embeddings, or the full embedding for a single prompt
+ for (int j = 0; j < n_prompts; j++) {
+ fprintf(stdout, "embedding %d: ", j);
+ for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+ if (params.embd_normalize == 0) {
+ fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ } else {
+ fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ }
+ }
+ fprintf(stdout, "\n");
}
- fprintf(stdout, "\n");
- for (int i = 0; i < n_prompts; i++) {
- for (int j = 0; j < n_prompts; j++) {
- float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
- fprintf(stdout, "%6.2f ", sim);
+
+ // print cosine similarity matrix
+ if (n_prompts > 1) {
+ fprintf(stdout, "\n");
+ printf("cosine similarity matrix:\n\n");
+ for (int i = 0; i < n_prompts; i++) {
+ fprintf(stdout, "%6.6s ", prompts[i].c_str());
}
- fprintf(stdout, "%1.10s", prompts[i].c_str());
fprintf(stdout, "\n");
+ for (int i = 0; i < n_prompts; i++) {
+ for (int j = 0; j < n_prompts; j++) {
+ float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+ fprintf(stdout, "%6.2f ", sim);
+ }
+ fprintf(stdout, "%1.10s", prompts[i].c_str());
+ fprintf(stdout, "\n");
+ }
}
}
}
}
fprintf(stdout, notArray ? "]\n }" : "]");
j++;
- if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+ if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
}
fprintf(stdout, notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
- for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+ for (int i = 0;;) { // at least two iterations (n_embd_count > 1)
fprintf(stdout, " [");
- for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+ for (int j = 0;;) { // at least two iterations (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
fprintf(stdout, "%6.2f", sim);
j++;
- if (j < n_prompts) fprintf(stdout, ", "); else break;
+ if (j < n_embd_count) fprintf(stdout, ", "); else break;
}
fprintf(stdout, " ]");
i++;
- if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+ if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
}
fprintf(stdout, "\n ]");
}
LLM_ARCH_CHATGLM,
LLM_ARCH_BITNET,
LLM_ARCH_T5,
+ LLM_ARCH_T5ENCODER,
LLM_ARCH_JAIS,
LLM_ARCH_UNKNOWN,
};
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
+ { LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
{ LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
},
},
+ {
+ LLM_ARCH_T5ENCODER,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+ { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
+ { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
+ { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
+ { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
+ { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
+ { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
+ { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
+ { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
+ { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
+ { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
+ },
+ },
{
LLM_ARCH_JAIS,
{
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+ model.type = e_model::MODEL_UNKNOWN;
+ } break;
case LLM_ARCH_JAIS:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
+
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
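+ // T5 stores the relative attention bias only in the first layer; later layers reuse layer 0 at graph build time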
+ layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});
+
+ layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
case LLM_ARCH_JAIS:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
return gf;
}
- struct ggml_cgraph * build_t5() {
+ struct ggml_cgraph * build_t5_encoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
- if (lctx.is_encoding) {
- struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
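+ // this graph covers only the encoder stack; the decoder graph is built separately in build_t5_decoder()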
+ GGML_ASSERT(lctx.is_encoding);
+ struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
- // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
- // norm
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm_enc, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
- // self-attention
- {
- struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
- cb(Qcur, "Qcur", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
- struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
- cb(Kcur, "Kcur", il);
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
+ cb(Qcur, "Qcur", il);
- struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
- cb(Vcur, "Vcur", il);
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
+ cb(Kcur, "Kcur", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
+ cb(Vcur, "Vcur", il);
- struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
- struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
- struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
- struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
- struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
- cb(kq_b, "kq_b", il);
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
- kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
+ struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
+ struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);
- struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
- cb(v, "v", il);
+ kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
- struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
- cb(kqv, "kqv", il);
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
+ cb(v, "v", il);
- struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);
- cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- ggml_build_forward_expand(gf, cur);
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
- cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
- cb(cur, "kqv_out", il);
- }
+ ggml_build_forward_expand(gf, cur);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- n_tokens = n_outputs;
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- }
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
+ cb(cur, "kqv_out", il);
+ }
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
- cb(ffn_inp, "ffn_inp", il);
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm_enc, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = llm_build_ffn(ctx0, lctx, cur,
- model.layers[il].ffn_up_enc, NULL, NULL,
- model.layers[il].ffn_gate_enc, NULL, NULL,
- model.layers[il].ffn_down_enc, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- cb, il);
- cb(cur, "ffn_out", il);
- }
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm_enc, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
- cur = ggml_add(ctx0, cur, ffn_inp);
+ // T5 uses relu, flan-T5 uses gelu-gated
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up_enc, NULL, NULL,
+ model.layers[il].ffn_gate_enc, NULL, NULL,
+ model.layers[il].ffn_down_enc, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
cb(cur, "ffn_out", il);
+ }
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
- cb(cur, "l_out", il);
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
- // input for next layer
- inpL = cur;
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
}
+ cb(cur, "l_out", il);
- cur = inpL;
- cb(cur, "result_embd", -1);
+ // input for next layer
+ inpL = cur;
+ }
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm_enc, NULL,
- LLM_NORM_RMS, cb, -1);
- cb(cur, "result_norm", -1);
- } else {
- GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
+ cur = inpL;
+ cb(cur, "result_embd", -1);
- struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
- struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm_enc, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
- struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
- struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
+ ggml_build_forward_expand(gf, cur);
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * inpSA = inpL;
+ return gf;
+ }
- // norm
- cur = llm_build_norm(ctx0, inpL, hparams,
- model.layers[il].attn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm", il);
+ struct ggml_cgraph * build_t5_decoder() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
- // self-attention
- {
- struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
- cb(Qcur, "Qcur", il);
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
- struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
- cb(Kcur, "Kcur", il);
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
- struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
- cb(Vcur, "Vcur", il);
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
- llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
- struct ggml_tensor * k =
- ggml_view_3d(ctx0, kv_self.k_l[il],
- n_embd_head_k, n_kv, n_head_kv,
- ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
- ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
- 0);
- cb(k, "k", il);
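+ // the decoder cross-attends to the encoder output stored by a preceding llama_encode() call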
+ GGML_ASSERT(!lctx.is_encoding);
+ GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
- struct ggml_tensor * v =
- ggml_view_3d(ctx0, kv_self.v_l[il],
- n_kv, n_embd_head_v, n_head_kv,
- ggml_element_size(kv_self.v_l[il])*n_ctx,
- ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
- 0);
- cb(v, "v", il);
+ struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
+ struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
+ struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
- struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
- struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
- struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
- struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
- struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
- cb(kq_b, "kq_b", il);
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
- kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
- struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
- cb(kqv, "kqv", il);
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
- struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
- cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct ggml_tensor * k =
+ ggml_view_3d(ctx0, kv_self.k_l[il],
+ n_embd_head_k, n_kv, n_head_kv,
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+ 0);
+ cb(k, "k", il);
- ggml_build_forward_expand(gf, cur);
+ struct ggml_tensor * v =
+ ggml_view_3d(ctx0, kv_self.v_l[il],
+ n_kv, n_embd_head_v, n_head_kv,
+ ggml_element_size(kv_self.v_l[il])*n_ctx,
+ ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
+ 0);
+ cb(v, "v", il);
- cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
- cb(cur, "kqv_out", il);
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- cur = ggml_add(ctx0, cur, inpSA);
- cb(cur, "cross_inp", il);
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct ggml_tensor * inpCA = cur;
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
- // norm
- cur = llm_build_norm(ctx0, cur, hparams,
- model.layers[il].attn_norm_cross, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "attn_norm_cross", il);
+ struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
+ struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
+ struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
+ cb(kq_b, "kq_b", il);
- // cross-attention
- {
- struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
- cb(Qcur, "Qcur", il);
+ kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
- struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
- cb(Kcur, "Kcur", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+ cb(kqv, "kqv", il);
- struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
- cb(Vcur, "Vcur", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
- Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
- struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
- struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
+ ggml_build_forward_expand(gf, cur);
- struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
- cb(kq, "kq", il);
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
+ cb(cur, "kqv_out", il);
+ }
- kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
- cb(kq, "kq_soft_max_ext", il);
+ cur = ggml_add(ctx0, cur, inpSA);
+ cb(cur, "cross_inp", il);
- struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
- cb(v, "v", il);
+ struct ggml_tensor * inpCA = cur;
- struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
- cb(kqv, "kqv", il);
+ // norm
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_norm_cross, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm_cross", il);
- struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cb(kqv_merged, "kqv_merged", il);
+ // cross-attention
+ {
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
+ cb(Qcur, "Qcur", il);
- cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
- cb(cur, "kqv_merged_cont", il);
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
+ cb(Kcur, "Kcur", il);
- ggml_build_forward_expand(gf, cur);
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
+ cb(Vcur, "Vcur", il);
- cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
- cb(cur, "kqv_out", il);
- }
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
- if (il == n_layer - 1) {
- // skip computing output for unused tokens
- struct ggml_tensor * inp_out_ids = build_inp_out_ids();
- n_tokens = n_outputs;
- cur = ggml_get_rows(ctx0, cur, inp_out_ids);
- inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
- inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
- }
+ struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+ struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
- struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
- cb(ffn_inp, "ffn_inp", il);
+ struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+ cb(kq, "kq", il);
- // feed-forward network
- {
- cur = llm_build_norm(ctx0, ffn_inp, hparams,
- model.layers[il].ffn_norm, NULL,
- LLM_NORM_RMS, cb, il);
- cb(cur, "ffn_norm", il);
+ kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
+ cb(kq, "kq_soft_max_ext", il);
- // T5 uses relu, flan-T5 uses gelu-gated
- cur = llm_build_ffn(ctx0, lctx, cur,
- model.layers[il].ffn_up, NULL, NULL,
- model.layers[il].ffn_gate, NULL, NULL,
- model.layers[il].ffn_down, NULL, NULL,
- NULL,
- model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
- model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
- cb, il);
- cb(cur, "ffn_out", il);
- }
+ struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
+ cb(v, "v", il);
- cur = ggml_add(ctx0, cur, ffn_inp);
- cb(cur, "ffn_out", il);
+ struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
+ cb(kqv, "kqv", il);
- ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
- if (layer_dir != nullptr) {
- cur = ggml_add(ctx0, cur, layer_dir);
- }
- cb(cur, "l_out", il);
+ struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
+ cb(kqv_merged, "kqv_merged", il);
- // input for next layer
- inpL = cur;
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
+ cb(cur, "kqv_merged_cont", il);
+
+ ggml_build_forward_expand(gf, cur);
+
+ cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
+ cb(cur, "kqv_out", il);
}
- cur = inpL;
- cb(cur, "result_embd", -1);
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
+ }
- cur = llm_build_norm(ctx0, cur, hparams,
- model.output_norm, NULL,
- LLM_NORM_RMS, cb, -1);
- cb(cur, "result_norm", -1);
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
+ cb(ffn_inp, "ffn_inp", il);
- // lm_head
- cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
- cb(cur, "result_output", -1);
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ // T5 uses relu, flan-T5 uses gelu-gated
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
+ model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
+ cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+ if (layer_dir != nullptr) {
+ cur = ggml_add(ctx0, cur, layer_dir);
+ }
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
}
+ cur = inpL;
+ cb(cur, "result_embd", -1);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
ggml_build_forward_expand(gf, cur);
return gf;
} break;
case LLM_ARCH_T5:
{
- result = llm.build_t5();
+ if (lctx.is_encoding) {
+ result = llm.build_t5_encoder();
+ } else {
+ result = llm.build_t5_decoder();
+ }
+ } break;
+ case LLM_ARCH_T5ENCODER:
+ {
+ result = llm.build_t5_encoder();
} break;
case LLM_ARCH_JAIS:
{
// TODO: use a per-batch flag for logits presence instead
const bool has_logits = !cparams.embeddings;
- const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
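+ // encoder output of encoder-decoder models is kept in lctx.embd_enc, so the per-token embd buffer is only needed when embeddings are requested without pooling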
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
// the output embeddings after the final encoder normalization
- struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 1];
+ struct ggml_tensor * embd = nullptr;
- GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
+ // there are two cases here
+ if (llama_model_has_decoder(&lctx.model)) {
+ // first case is an encoder-decoder T5 model where embeddings are passed to decoder
+ embd = gf->nodes[gf->n_nodes - 1];
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_norm tensor");
+ } else {
+ // second case is an encoder-only T5 model
+ if (cparams.embeddings) {
+ // only output embeddings if required
+ embd = gf->nodes[gf->n_nodes - 1];
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
+ embd = gf->nodes[gf->n_nodes - 2];
+ }
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
+ }
+ }
ggml_backend_sched_alloc_graph(lctx.sched, gf);
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);
- // extract token embeddings
- GGML_ASSERT(lctx.embd != nullptr);
+ if (llama_model_has_decoder(&lctx.model)) {
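+ // encoder-decoder model: stash the encoder output in the context so the decoder can read it via llm_build_inp_embd_enc() during cross-attention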
+ lctx.embd_enc.resize(n_tokens*n_embd);
+ float * embd_out = lctx.embd_enc.data();
- lctx.embd_enc.resize(n_tokens*n_embd);
- float * embd_out = lctx.embd_enc.data();
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
- ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ // remember the sequence ids used during the encoding - needed for cross attention later
+ lctx.seq_ids_enc.resize(n_tokens);
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ for (int s = 0; s < batch.n_seq_id[i]; s++) {
+ llama_seq_id seq_id = batch.seq_id[i][s];
+ lctx.seq_ids_enc[i].insert(seq_id);
+ }
+ }
+ } else {
+ GGML_ASSERT(lctx.embd != nullptr);
- // remember the sequence ids used during the encoding - needed for cross attention later
- lctx.seq_ids_enc.resize(n_tokens);
- for (uint32_t i = 0; i < n_tokens; i++) {
- for (int s = 0; s < batch.n_seq_id[i]; s++) {
- llama_seq_id seq_id = batch.seq_id[i][s];
- lctx.seq_ids_enc[i].insert(seq_id);
+ switch (cparams.pooling_type) {
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ // extract token embeddings
+ GGML_ASSERT(lctx.embd != nullptr);
+ float * embd_out = lctx.embd;
+
+ GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
+ } break;
+ case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ // extract sequence embeddings
+ auto & embd_seq_out = lctx.embd_seq;
+ embd_seq_out.clear();
+
+ for (uint32_t i = 0; i < n_tokens; i++) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
+ continue;
+ }
+ embd_seq_out[seq_id].resize(n_embd);
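+ // pooled embeddings are laid out one row per sequence id, hence the n_embd*seq_id offset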
+ ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
+ }
+ } break;
+ case LLAMA_POOLING_TYPE_UNSPECIFIED:
+ {
+ GGML_ABORT("unknown pooling type");
+ }
}
}
}
ctx->sampling.rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
+ // build worst-case graph for encoder if the model contains an encoder
+ ctx->is_encoding = llama_model_has_encoder(model);
uint32_t kv_size = cparams.n_ctx;
ggml_type type_k = params.type_k;
case LLM_ARCH_MAMBA:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_T5:
+ case LLM_ARCH_T5ENCODER:
case LLM_ARCH_JAIS:
return LLAMA_ROPE_TYPE_NONE;
bool llama_model_has_encoder(const struct llama_model * model) {
switch (model->arch) {
- case LLM_ARCH_T5: return true;
- default: return false;
+ case LLM_ARCH_T5: return true;
+ case LLM_ARCH_T5ENCODER: return true;
+ default: return false;
+ }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+ switch (model->arch) {
+ case LLM_ARCH_T5ENCODER: return false;
+ default: return true;
}
}