llama.cpp : add documentation about rope_freq_base and scale values (#3401)

author slaren <redacted>

Fri, 29 Sep 2023 16:42:32 +0000 (18:42 +0200)

committer GitHub <redacted>

Fri, 29 Sep 2023 16:42:32 +0000 (18:42 +0200)
author slaren <redacted>
Fri, 29 Sep 2023 16:42:32 +0000 (18:42 +0200)
committer GitHub <redacted>
Fri, 29 Sep 2023 16:42:32 +0000 (18:42 +0200)
diff --git a/README.md b/README.md

index 75b6075f2816da27c416c615ae107b06077b4c55..ec7b5894327ed64492b017acda44a15df28c300f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
  
  ### Hot topics
  
+- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
  - Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
    **Devs should become familiar with the new API**
  - Local Falcon 180B inference on Mac Studio
diff --git a/llama.h b/llama.h

index 96ff1f09c76dbd88838fb8398cbe0f13f269d463..fde4d6eca0d20e76e3ed22b6afd3f91f8c151fc8 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -167,18 +167,18 @@ extern "C" {
  
      struct llama_context_params {
          uint32_t seed;            // RNG seed, -1 for random
-        uint32_t n_ctx;           // text context
-        uint32_t n_batch;         // prompt processing batch size
+        uint32_t n_ctx;           // text context, 0 = from model
+        uint32_t n_batch;         // prompt processing maximum batch size
          uint32_t n_threads;       // number of threads to use for generation
          uint32_t n_threads_batch; // number of threads to use for batch processing
  
          // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
-        float rope_freq_scale; // RoPE frequency scaling factor
+        float rope_freq_base;  // RoPE base frequency, 0 = from model
+        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
  
          // Keep the booleans together to avoid misalignment during copy-by-value.
          bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
          bool logits_all; // the llama_eval() call computes all logits, not just the last one
          bool embedding;  // embedding mode only
      };
author	slaren <redacted>
	Fri, 29 Sep 2023 16:42:32 +0000 (18:42 +0200)
committer	GitHub <redacted>
	Fri, 29 Sep 2023 16:42:32 +0000 (18:42 +0200)
README.md		patch | blob | history
llama.h		patch | blob | history