return lines;
}
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
- for (size_t i = 0; i < tokens.size(); i++) {
- llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+ size_t n_tokens = tokens.size();
+ for (size_t i = 0; i < n_tokens; i++) {
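+ // request an output for every token so the backend has embeddings available for sequence-level pooling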
+ llama_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
- if (embd == NULL) {
- embd = llama_get_embeddings_ith(ctx, i);
- if (embd == NULL) {
- fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
- continue;
- }
- }
+ GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
float * out = output + batch.seq_id[i][0] * n_embd;
// TODO: add a parameter to enable or disable normalization
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
+ const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+ if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+ fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+ return 1;
+ }
+
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
return lctx.inp_s_seq;
}
+ struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+ // find result_norm tensor for input
+ struct ggml_tensor * inp = nullptr;
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
+ inp = gf->nodes[i];
+ if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+ break;
+ } else {
+ inp = nullptr;
+ }
+ }
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+ struct ggml_tensor * cur;
+
+ switch (pooling_type) {
+ case LLAMA_POOLING_TYPE_MEAN:
+ {
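+ // mean pooling: multiply by inp_mean to average the token embeddings of each sequence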
+ struct ggml_tensor * inp_mean = build_inp_mean();
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+ } break;
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
+ {
+ struct ggml_tensor * inp_cls = build_inp_cls();
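+ // inp_cls holds, for each sequence, the row index of the token to extract (the CLS token or the last token)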
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
+ } break;
+ case LLAMA_POOLING_TYPE_NONE:
+ {
+ cur = inp;
+ } break;
+ default:
+ {
+ GGML_ASSERT(false && "unknown pooling type");
+ } break;
+ }
+
+ cb(cur, "result_embd_pooled", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
if (model.arch != LLM_ARCH_JINA_BERT_V2) {
inp_pos = build_inp_pos();
}
- struct ggml_tensor * inp_mean = build_inp_mean();
- struct ggml_tensor * inp_cls = build_inp_cls();
// construct input embeddings (token, type, position)
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
cur = inpL;
cb(cur, "result_embd", -1);
- // pooling layer
- switch (pooling_type) {
- case LLAMA_POOLING_TYPE_NONE:
- {
- // nop
- } break;
- case LLAMA_POOLING_TYPE_MEAN:
- {
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
- cb(cur, "result_embd_pooled", -1);
- } break;
- case LLAMA_POOLING_TYPE_CLS:
- {
- cur = ggml_get_rows(ctx0, cur, inp_cls);
- cb(cur, "result_embd_pooled", -1);
- } break;
- case LLAMA_POOLING_TYPE_UNSPECIFIED:
- {
- GGML_ASSERT(false && "Invalid pooling type");
- } break;
- }
-
ggml_build_forward_expand(gf, cur);
return gf;
GGML_ASSERT(false);
}
+ // add on pooling layer
+ if (lctx.cparams.embeddings) {
+ result = llm.append_pooling(result);
+ }
+
llm.free();
return result;
// (!a || b) is a logical implication (a -> b)
// !hparams.causal_attn -> !cparams.causal_attn
(hparams.causal_attn || !cparams.causal_attn) &&
- "causal attention with embedding models is not supported"
+ "causal attention is not supported by this model"
);
if (lctx.inp_KQ_mask) {
}
}
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+ const int64_t n_tokens = batch.n_tokens;
+
+ GGML_ASSERT(lctx.inp_cls);
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
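+ // track the highest position seen for each sequence and the batch row that holds it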
+ std::vector<int> last_pos(n_tokens, -1);
+ std::vector<int> last_row(n_tokens, -1);
+
+ for (int i = 0; i < n_tokens; ++i) {
+ const llama_seq_id seq_id = batch.seq_id[i][0];
+ const llama_pos pos = batch.pos[i];
+
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+ if (pos >= last_pos[seq_id]) {
+ last_pos[seq_id] = pos;
+ last_row[seq_id] = i;
+ }
+ }
+
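+ // store the row index of each sequence's last token in inp_cls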
+ for (int i = 0; i < n_tokens; ++i) {
+ if (last_row[i] >= 0) {
+ data[i] = last_row[i];
+ }
+ }
+ }
+
if (kv_self.recurrent) {
const int64_t n_kv = kv_self.n;
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
- const bool has_logits = cparams.causal_attn;
- const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
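+ // with embeddings enabled, logits are not extracted; the token-level embd buffer is only needed for pooling type NONE (pooled embeddings are written to lctx.embd_seq instead)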
+ const bool has_logits = !cparams.embeddings;
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
std::vector<std::vector<llama_seq_id>> seq_id;
// count outputs
- if (batch_all.logits) {
+ if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
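+ // pooled embeddings reserve an output for every token in the batch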
+ n_outputs = n_tokens_all;
+ } else if (batch_all.logits) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs += batch_all.logits[i] != 0;
}
- } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
+ } else if (lctx.logits_all) {
n_outputs = n_tokens_all;
} else {
// keep last output only
// no output
res = nullptr;
embd = nullptr;
- } else if (!hparams.causal_attn) {
- res = nullptr; // do not extract logits for embedding models such as BERT
-
- // token or sequence embeddings
- embd = gf->nodes[gf->n_nodes - 1];
-
- GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
} else if (cparams.embeddings) {
- // the embeddings could be in the second to last tensor, or any of the previous tensors
- int i_embd = gf->n_nodes - 2;
- for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
- i_embd = gf->n_nodes - i;
- if (i_embd < 0) { break; }
- embd = gf->nodes[i_embd];
- }
- GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
- // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
- if (!cparams.causal_attn) {
- res = nullptr; // do not extract logits when not needed
- // skip computing logits
- // TODO: is this safe?
- gf->n_nodes = i_embd + 1;
+ res = nullptr; // do not extract logits for embedding case
+ embd = gf->nodes[gf->n_nodes - 1];
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
+ embd = gf->nodes[gf->n_nodes - 2];
}
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
} else {
embd = nullptr; // do not extract embeddings when not needed
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
}
} break;
- case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_MEAN:
+ case LLAMA_POOLING_TYPE_CLS:
+ case LLAMA_POOLING_TYPE_LAST:
{
- GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
// extract sequence embeddings
auto & embd_seq_out = lctx.embd_seq;
embd_seq_out.clear();
ctx->abort_callback_data = abort_callback_data;
}
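+// set whether the context is in embeddings mode; when true, embeddings are returned and logits are not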
+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+ ctx->cparams.embeddings = embeddings;
+}
+
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}