llama : remove mtest (#3177)

author Roland <redacted>

Fri, 15 Sep 2023 07:28:45 +0000 (03:28 -0400)

committer GitHub <redacted>

Fri, 15 Sep 2023 07:28:45 +0000 (10:28 +0300)
author Roland <redacted>
Fri, 15 Sep 2023 07:28:45 +0000 (03:28 -0400)
committer GitHub <redacted>
Fri, 15 Sep 2023 07:28:45 +0000 (10:28 +0300)
diff --git a/common/common.cpp b/common/common.cpp

index afc9b8a55bfae60ebecee5bf1ffd536f750c966e..9969cb97d3c2aea44bab66d71544c737f574b080 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -434,8 +434,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
  #endif // GGML_USE_CUBLAS
          } else if (arg == "--no-mmap") {
              params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
          } else if (arg == "--numa") {
              params.numa = true;
          } else if (arg == "--export") {
@@ -687,7 +685,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
      printf("                        Not recommended since this is both slower and uses more VRAM.\n");
  #endif // GGML_USE_CUBLAS
  #endif
-    printf("  --mtest               compute maximum memory usage\n");
      printf("  --export              export the computation graph to 'llama.ggml'\n");
      printf("  --verbose-prompt      print prompt before generation\n");
      fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
@@ -1225,7 +1222,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
      fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
      fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
      fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
      fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
      fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
      fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
diff --git a/common/common.h b/common/common.h

index 238635ae3065da6d4fa2de6d557f28b35ead94ba..4979f99ddbc236c82eced192350d50e3b39b970c 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -110,7 +110,6 @@ struct gpt_params {
      bool perplexity        = false; // compute perplexity over the prompt
      bool use_mmap          = true;  // use mmap for faster loads
      bool use_mlock         = false; // use mlock to keep model in memory
-    bool mem_test          = false; // compute maximum memory usage
      bool numa              = false; // attempt optimizations that help on some NUMA systems
      bool export_cgraph     = false; // export the computation graph
      bool verbose_prompt    = false; // print prompt tokens before generation
diff --git a/examples/main/README.md b/examples/main/README.md

index 2773fe976b57d7cafb06e5d45b2830d50238c83b..26e1e28dd08c179a8726ee8eff8b8d61ac56af8d 100644 (file)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
  
  Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
  
-- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+-   `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
  
  ### Keep Prompt
  
@@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.
  
  ### NUMA support
  
--   `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
+-   `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
  
  ### Memory Float 32
  
@@ -302,7 +302,6 @@ These options provide extra functionality and customization when running the LLa
  
  -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
  -   `--verbose-prompt`: Print the prompt before generating text.
--   `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
  -   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
  -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
  -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index baec6ba129da0b744fbef903d903d5719f50772f..a8179f1bf011fa3bfda6c43d553923b16a855356 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -198,23 +198,6 @@ int main(int argc, char ** argv) {
                  params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
      }
  
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
-
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
-        }
-
-        llama_print_timings(ctx);
-        llama_free(ctx);
-        llama_free_model(model);
-
-        return 0;
-    }
-
      // export the cgraph and exit
      if (params.export_cgraph) {
          llama_eval_export(ctx, "llama.ggml");
diff --git a/run_with_preset.py b/run_with_preset.py

index 8f90f52a9586e9047ec7e2f171bfd2a17740bc94..9b4d7ecbe82d4381b08d696263030092395f10be 100755 (executable)
--- a/run_with_preset.py
+++ b/run_with_preset.py
@@ -13,7 +13,7 @@ CLI_ARGS_MAIN_PERPLEXITY = [
      "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
      "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
      "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
-    "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
+    "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
      "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
      "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
      "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
author	Roland <redacted>
	Fri, 15 Sep 2023 07:28:45 +0000 (03:28 -0400)
committer	GitHub <redacted>
	Fri, 15 Sep 2023 07:28:45 +0000 (10:28 +0300)
common/common.cpp		patch | blob | history
common/common.h		patch | blob | history
examples/main/README.md		patch | blob | history
examples/main/main.cpp		patch | blob | history
run_with_preset.py		patch | blob | history