const llama_seq_id seq_id = ubatch.seq_id_unq[s];
const int32_t seq_idx = ubatch.seq_idx[seq_id];
- embd_seq_out[seq_id].resize(n_embd);
- ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+ // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
+ // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
+ const uint32_t n_embd_out = hparams.n_embd_out();
+ embd_seq_out[seq_id].resize(n_embd_out);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_RANK:
// extract sequence embeddings (cleared before processing each batch)
auto & embd_seq_out = embd_seq;
+ // use n_embd_out (not n_embd_inp) - the pooled embedding has the model's
+ // output dimension, which differs from input dimension for deepstack models (e.g. qwen3vl)
+ const uint32_t n_embd_out = hparams.n_embd_out();
+
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
const llama_seq_id seq_id = ubatch.seq_id_unq[s];
const int32_t seq_idx = ubatch.seq_idx[seq_id];
- embd_seq_out[seq_id].resize(n_embd);
- ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_idx)*sizeof(float), n_embd*sizeof(float));
+ embd_seq_out[seq_id].resize(n_embd_out);
+ ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd_out*seq_idx)*sizeof(float), n_embd_out*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_RANK: