readme : remove --memory-f32 references (#9925)

author Georgi Gerganov <redacted>

Thu, 17 Oct 2024 20:43:05 +0000 (23:43 +0300)

committer Georgi Gerganov <redacted>

Thu, 17 Oct 2024 20:43:05 +0000 (23:43 +0300)
author Georgi Gerganov <redacted>
Thu, 17 Oct 2024 20:43:05 +0000 (23:43 +0300)
committer Georgi Gerganov <redacted>
Thu, 17 Oct 2024 20:43:05 +0000 (23:43 +0300)
diff --git a/examples/main/README.md b/examples/main/README.md

index 620934dad4ad559fd38d28dbe0a32deb12acfbf9..7e192b9f2837c6e170b3267bfa51813a48379b87 100644 (file)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -297,10 +297,6 @@ These options help improve the performance and memory usage of the LLaMA models.
  
   These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
  
-### Memory Float 32
-
--   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.
-
  ### Batch Size
  
  -   `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py

index ee21eab371418ac469d237a4188d83d65583f6e9..47cacb432158907f2b886c4c26016b2caff6af84 100755 (executable)
--- a/scripts/run-with-preset.py
+++ b/scripts/run-with-preset.py
@@ -15,7 +15,7 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
      "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
      "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix",
      "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
-    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
+    "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
      "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
      "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
      "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n",
@@ -25,12 +25,12 @@ CLI_ARGS_LLAMA_CLI_PERPLEXITY = [
  ]
  
  CLI_ARGS_LLAMA_BENCH = [
-    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
+    "batch-size", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
      "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
  ]
  
  CLI_ARGS_LLAMA_SERVER = [
-    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "alias", "batch-size", "ctx-size", "embedding", "host", "lora", "lora-base",
      "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
      "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
      "threads", "verbose"
author	Georgi Gerganov <redacted>
	Thu, 17 Oct 2024 20:43:05 +0000 (23:43 +0300)
committer	Georgi Gerganov <redacted>
	Thu, 17 Oct 2024 20:43:05 +0000 (23:43 +0300)
examples/main/README.md		patch | blob | history
scripts/run-with-preset.py		patch | blob | history