static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
int N = (int) tokens.size();
- std::vector<llama_pos> pos;
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
auto batch = llama_batch_get_one(&tokens[i], n_eval);
- // TODO: add mrope pos ids somewhere else
- pos.resize(batch.n_tokens * 4);
- std::fill(pos.begin(), pos.end(), 0);
- for (int j = 0; j < batch.n_tokens * 3; j ++) {
- pos[j] = *st_pos_id + (j % batch.n_tokens);
- }
- batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
if (ubatch->pos && pos) {
const int64_t n_tokens = ubatch->n_tokens;
- ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+ if (ubatch->token && n_pos_per_embd > 1) {
+ // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+ // the other dimensions are all 0, they are unused for text tokens
+ std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd, 0);
+ // copy the first dimension
+ for (int i = 0; i < n_tokens; ++i) {
+ pos_data[i] = ubatch->pos[i];
+ }
+ ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+ } else {
+ ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+ }
}
}
) * f_attn_temp_scale + 1.0;
}
- ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+ ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
}
}
res (std::make_unique<llm_graph_result>()) {
}
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
}
}
ggml_tensor * llm_graph_context::build_inp_pos() const {
- auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+ auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
auto & cur = inp->pos;
- cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
ggml_set_input(cur);
res->add_input(std::move(inp));
}
ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
- auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+ auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
auto & cur = inp->attn_scale;
- cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+ // this need to be 1x1xN for broadcasting
+ cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
ggml_set_input(cur);
res->add_input(std::move(inp));
class llm_graph_input_pos : public llm_graph_input_i {
public:
- llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+ llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
virtual ~llm_graph_input_pos() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * pos = nullptr; // I32 [n_batch]
- const int64_t n_pos_per_token = 1;
+ const int64_t n_pos_per_embd = 1;
};
// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
- llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
- : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+ llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+ : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
virtual ~llm_graph_input_attn_temp() = default;
void set_input(const llama_ubatch * ubatch) override;
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
- const int64_t n_pos_per_token = 1;
-
const uint32_t n_attn_temp_floor_scale;
const float f_attn_temp_scale;
};
llm_graph_context(const llm_graph_params & params);
- int64_t n_pos_per_token() const;
+ int64_t n_pos_per_embd() const;
void cb(ggml_tensor * cur, const char * name, int il) const;