llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
- float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+ const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
GGML_ASSERT(n_embd_head == hparams.n_rot);
- ggml_tensor *inpL, *cur;
+ ggml_tensor * inpL;
+ ggml_tensor * cur;
+
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
}
ggml_tensor * inpSA = inpL;
- cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+ cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
// build self attention
{
#include "models.h"
-
-
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_k;
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
#include "models.h"
-
-
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params),
model(model),
inpL = build_inp_embd(model.tok_embd);
// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
- if (ubatch.token) {
- inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
- cb(inpL, "inp_scaled", -1);
- }
+ inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+ cb(inpL, "inp_scaled", -1);
+
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
// equivalent to get_per_layer_inputs() in python code
// output shape: [n_embd_altup, n_layer, n_tokens]
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
- auto inp = std::make_unique<llm_graph_input_embd>();
+ auto inp = std::make_unique<llm_graph_input_embd>();
ggml_tensor * inp_per_layer;
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);