printf(" --cfg-negative-prompt-file FNAME\n");
printf(" negative prompt file to use for guidance. (default: empty)\n");
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
- printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
- printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
- printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+ printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+ printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+ printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
- printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
- printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+ printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
+ printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
static const size_t MB = kB*kB;
static const size_t GB = kB*kB*kB;
-// default hparams (LLaMA 7B)
struct llama_hparams {
- uint32_t n_vocab = 32000;
- uint32_t n_ctx_train = 2048; // the context size used during training
- uint32_t n_ctx = 512; // the context size used during inference
- uint32_t n_embd = 4096;
- uint32_t n_head = 32;
- uint32_t n_head_kv = 32;
- uint32_t n_layer = 32;
- uint32_t n_rot = 64;
- uint32_t n_ff = 11008;
-
- float f_norm_eps = 1e-5;
- float f_norm_rms_eps = 1e-5;
-
- float rope_freq_base = 10000.0f;
- float rope_freq_scale = 1.0f;
+ uint32_t n_vocab;
+ uint32_t n_ctx_train; // context size the model was trained on
+ uint32_t n_ctx; // context size used during inference
+ uint32_t n_embd;
+ uint32_t n_head;
+ uint32_t n_head_kv;
+ uint32_t n_layer;
+ uint32_t n_rot;
+ uint32_t n_ff;
+
+ float f_norm_eps;
+ float f_norm_rms_eps;
+
+ float rope_freq_base; // filled from GGUF metadata or the caller's override
+ float rope_freq_scale; // filled from GGUF metadata or the caller's override
bool operator!=(const llama_hparams & other) const {
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
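With the in-class initializers gone, value-initialization at the declaration site becomes load-bearing: a plain `llama_hparams hparams;` would leave every member indeterminate, and the memcmp-based operator!= above would compare garbage (it already carries the NOLINT because memcmp also inspects padding bytes). Hence the `= {}` added to the declaration in llama_model below. A two-line sketch of the difference:

    llama_hparams a;      // default-initialized: members hold indeterminate values
    llama_hparams b = {}; // value-initialized: members zeroed, rope fields read as "unset"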
std::string name = "n/a";
- llama_hparams hparams;
+ llama_hparams hparams = {};
llama_vocab vocab;
struct ggml_tensor * tok_embeddings;
hparams.n_head_kv = hparams.n_head;
GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
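For reference, GGUF_GET_KEY with `false` follows a seed-then-override pattern: the field is first given a fallback (here n_head, i.e. no grouped-query attention), and the kv replaces it only when present in the file. Roughly equivalent logic written against the public gguf API (a sketch, not the macro's exact expansion; the key string follows the gguf naming scheme):

    hparams.n_head_kv = hparams.n_head; // fallback: one KV head per attention head
    const int kid = gguf_find_key(ctx, "llama.attention.head_count_kv");
    if (kid >= 0) { // key present in the model file
        hparams.n_head_kv = gguf_get_val_u32(ctx, kid);
    }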
- // TODO: manually setting rope freq base and scale should override this
- // FIXME: partial fix when the param specified is not the default value, but
- // will not work for overriding the model value to the params default
-
- llama_context_params defaults = llama_context_default_params();
-
- // rope_freq_base
- {
- float ropebase = 10000.0f;
- GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
- if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
- rope_freq_base = ropebase;
- }
+ // rope_freq_base (optional) - read from the model only when not set by the caller
+ if (rope_freq_base == 0.0f) {
+ rope_freq_base = 10000.0f;
+ GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
}
// rope_freq_scale (inverse of the kv) is optional
- {
+ if (rope_freq_scale == 0.0f) {
float ropescale = 1.0f;
GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
- if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
- rope_freq_scale = 1.0f/ropescale;
- }
+ rope_freq_scale = 1.0f/ropescale;
}
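Together with the 0.0f defaults in llama_context_default_params() below, the two blocks above give each rope parameter a three-level precedence. A hypothetical helper distilling it (the function and parameter names are illustrative):

    // precedence: caller override > model metadata > built-in fallback
    static float resolve_rope_param(float requested, bool model_has_value,
                                    float model_value, float fallback) {
        if (requested != 0.0f) {
            return requested;   // 1. value set explicitly by the user/caller
        }
        if (model_has_value) {
            return model_value; // 2. value stored in the GGUF metadata
        }
        return fallback;        // 3. 10000.0f for the base, 1.0f for the scale
    }

For the scale, the stored kv is the human-facing linear factor, so the resolved value is additionally inverted (rope_freq_scale = 1.0f/ropescale). API callers that want a fixed scaling regardless of the model can still set, e.g., params.rope_freq_scale = 0.5f explicitly before loading the model.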
// sanity check for n_rot (optional)
/*.n_gpu_layers =*/ 0,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
- /*.rope_freq_base =*/ 10000.0f,
- /*.rope_freq_scale =*/ 1.0f,
+ /*.rope_freq_base =*/ 0.0f,
+ /*.rope_freq_scale =*/ 0.0f,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.low_vram =*/ false,