auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
+ lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd;
+ const int n_embd = hparams.n_embd_gqa();
const int n_ctx = hparams.n_ctx;
const size_t kv_size = kv_self.buf.size;
const auto & kv_self = ctx->kv_self;
const auto & hparams = ctx->model.hparams;
const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd;
+ const int n_embd = hparams.n_embd_gqa();
const int n_ctx = hparams.n_ctx;
size_t kv_size;