imatrix : migrate to gpt_params (#7771)

author Georgi Gerganov <redacted>

Thu, 6 Jun 2024 13:30:58 +0000 (16:30 +0300)

committer GitHub <redacted>

Thu, 6 Jun 2024 13:30:58 +0000 (16:30 +0300)
author Georgi Gerganov <redacted>
Thu, 6 Jun 2024 13:30:58 +0000 (16:30 +0300)
committer GitHub <redacted>
Thu, 6 Jun 2024 13:30:58 +0000 (16:30 +0300)
diff --git a/common/common.cpp b/common/common.cpp

index c8df9a4ce8ef5618bd7bd65ae43ae23edd60fba0..601bd216430db8509dd69ac9ae046ce192336ac9 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
          }
      } catch (const std::invalid_argument & ex) {
          fprintf(stderr, "%s\n", ex.what());
+        params = params_org;
          return false;
      }
  
@@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          }
          return true;
      }
+    if (arg == "--in-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        params.in_files.push_back(argv[i]);
+        return true;
+    }
      if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
          if (++i >= argc) {
              invalid_param = true;
@@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          return true;
      }
      if (arg == "-v" || arg == "--verbose") {
-        params.verbose = true;
+        params.verbosity = 1;
+        return true;
+    }
+    if (arg == "--verbosity") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.verbosity = std::stoi(argv[i]);
          return true;
      }
      if (arg == "--verbose-prompt") {
@@ -1537,6 +1560,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
          params.i_pos = std::stoi(argv[i]);
          return true;
      }
+    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.out_file = argv[i];
+        return true;
+    }
+    if (arg == "-ofreq" || arg == "--output-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_out_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--process-output") {
+        params.process_output = true;
+        return true;
+    }
+    if (arg == "--no-ppl") {
+        params.compute_ppl = false;
+        return true;
+    }
+    if (arg == "--chunk" || arg == "--from-chunk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_chunk = std::stoi(argv[i]);
+        return true;
+    }
  #ifndef LOG_DISABLE_LOGS
      // Parse args for logging parameters
      if (log_param_single_parse(argv[i])) {
@@ -1612,6 +1675,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
      options.push_back({ "*",           "-h,    --help, --usage",        "print usage and exit" });
      options.push_back({ "*",           "       --version",              "show version and build info" });
      options.push_back({ "*",           "-v,    --verbose",              "print verbose information" });
+    options.push_back({ "*",           "       --verbosity N",          "set specific verbosity level (default: %d)", params.verbosity });
      options.push_back({ "*",           "       --verbose-prompt",       "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
      options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
      options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
@@ -1637,6 +1701,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
      options.push_back({ "*",           "-fa,   --flash-attn",           "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
      options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with (default: '%s')", params.prompt.c_str() });
      options.push_back({ "*",           "-f,    --file FNAME",           "a file containing the prompt (default: none)" });
+    options.push_back({ "*",           "       --in-file FNAME",        "an input file (repeat to specify multiple files)" });
      options.push_back({ "*",           "-bf,   --binary-file FNAME",    "binary file containing the prompt (default: none)" });
      options.push_back({ "*",           "-e,    --escape",               "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
      options.push_back({ "*",           "       --no-escape",            "do not process escape sequences" });
@@ -1804,6 +1869,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
      options.push_back({ "passkey",     "       --junk N",               "number of times to repeat the junk text (default: %d)", params.n_junk });
      options.push_back({ "passkey",     "       --pos N",                "position of the passkey in the junk text (default: %d)", params.i_pos });
  
+    options.push_back({ "imatrix" });
+    options.push_back({ "imatrix",     "-o,    --output FNAME",         "output file (default: '%s')", params.out_file.c_str() });
+    options.push_back({ "imatrix",     "       --output-frequency N",   "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix",     "       --save-frequency N",     "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+    options.push_back({ "imatrix",     "       --process-output",       "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+    options.push_back({ "imatrix",     "       --no-ppl",               "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+    options.push_back({ "imatrix",     "       --chunk N",              "start processing the input from chunk N (default: %d)", params.i_chunk });
+
      options.push_back({ "bench" });
      options.push_back({ "bench",       "-pps",                          "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
      options.push_back({ "bench",       "-npp n0,n1,...",                "number of prompt tokens" });
diff --git a/common/common.h b/common/common.h

index e0a08a61b7424a9bc6bcc4106be04ed5d47b0e5c..de6238e27f7568ca57e4d8587986f6a9111fb6b4 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -56,43 +56,42 @@ struct gpt_params {
      uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
  
      int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
-    int32_t n_predict             = -1;    // new tokens to predict
-    int32_t n_ctx                 = 0;     // context size
-    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
-    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel            = 1;     // number of parallel sequences to decode
-    int32_t n_sequences           = 1;     // number of sequences to decode
-    float   p_split               = 0.1f;  // speculative decoding split probability
-    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
-    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
-    int32_t grp_attn_n            = 1;     // group-attention factor
-    int32_t grp_attn_w            = 512;   // group-attention width
-    int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled)
-    float   rope_freq_base        = 0.0f;  // RoPE base frequency
-    float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor
+    int32_t n_threads_draft       =    -1;
+    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft =    -1;
+    int32_t n_predict             =    -1; // new tokens to predict
+    int32_t n_ctx                 =     0; // context size
+    int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                =     0; // number of tokens to keep from initial prompt
+    int32_t n_draft               =     5; // number of tokens to draft during speculative decoding
+    int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel            =     1; // number of parallel sequences to decode
+    int32_t n_sequences           =     1; // number of sequences to decode
+    float   p_split               =  0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs
+    int32_t n_beams               =     0; // if non-zero then use beam search of given width.
+    int32_t grp_attn_n            =     1; // group-attention factor
+    int32_t grp_attn_w            =   512; // group-attention width
+    int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled)
+    float   rope_freq_base        =  0.0f; // RoPE base frequency
+    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
      float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor
+    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
      float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
-    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
+    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx         =     0; // YaRN original context length
      float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers       = "";    // comma separated list of RPC servers
  
      ggml_backend_sched_eval_callback cb_eval = nullptr;
      void * cb_eval_user_data                 = nullptr;
  
      ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
  
+    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
      enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
      enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  
@@ -114,7 +113,9 @@ struct gpt_params {
      std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
      std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
      std::string logits_file          = ""; // file for saving *all* logits
+    std::string rpc_servers          = ""; // comma separated list of RPC servers
  
+    std::vector<std::string> in_files;   // all input files
      std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
      std::vector<llama_model_kv_override> kv_overrides;
  
@@ -124,23 +125,24 @@ struct gpt_params {
  
      std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
  
+    int32_t verbosity                  = 0;
      int32_t control_vector_layer_start = -1; // layer range for control vector
      int32_t control_vector_layer_end   = -1; // layer range for control vector
  
-    int32_t ppl_stride      = 0;    // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int32_t ppl_output_type = 0;    // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-                                    //                                       (which is more convenient to use for plotting)
-                                    //
-    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
+    int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                     //                                       (which is more convenient to use for plotting)
+                                     //
+    bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score
  
-    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
+    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
  
-    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
+    bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt
+    size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
  
-    bool   kl_divergence   = false; // compute KL divergence
+    bool   kl_divergence    = false; // compute KL divergence
  
      bool usage             = false; // print usage
      bool use_color         = false; // use color to distinguish generations and inputs
@@ -163,7 +165,6 @@ struct gpt_params {
      bool logits_all        = false; // return logits for all tokens in the batch
      bool use_mmap          = true;  // use mmap for faster loads
      bool use_mlock         = false; // use mlock to keep model in memory
-    bool verbose           = false;
      bool verbose_prompt    = false; // print prompt tokens before generation
      bool display_prompt    = true;  // print prompt before generation
      bool infill            = false; // use infill mode
@@ -180,10 +181,10 @@ struct gpt_params {
      std::vector<std::string> image; // path to image file(s)
  
      // server params
-    int32_t port           = 8080;
-    int32_t timeout_read   = 600;
-    int32_t timeout_write  = timeout_read;
-    int32_t n_threads_http = -1;
+    int32_t port           = 8080;         // server listens on this network port
+    int32_t timeout_read   = 600;          // http read timeout in seconds
+    int32_t timeout_write  = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;           // number of threads to use for http server (-1 = use n_threads)
  
      std::string hostname      = "127.0.0.1";
      std::string public_path   = "";
@@ -219,6 +220,16 @@ struct gpt_params {
      // passkey params
      int32_t n_junk = 250; // number of times to repeat the junk text
      int32_t i_pos  = -1;  // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk     =  0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl    = true;  // whether to compute perplexity
  };
  
  void gpt_params_handle_model_default(gpt_params & params);
diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md

index 458c01b8751f17097a24623c00319ad7f65b577b..866ca9f5682ac863eca796b4a59e6d103bcccdfc 100644 (file)
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
  ## Usage
  
  ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-        [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
  ```
  
  Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
  The parameters in square brackets are optional and have the following meaning:
  * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
  * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
  
  For faster computation, make sure to use GPU offloading via the `-ngl` argument
  
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp

index e050c09d2f38b0ac173e2e898838a7e349250035..38420041c47817e1418579ce228bed7bff85d709 100644 (file)
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -17,39 +17,37 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
  
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
  struct Stats {
      std::vector<float> values;
      std::vector<int> counts;
      int ncall = 0;
  };
  
-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int         n_output_frequency = 10;
-    int         verbosity = 1;
-    int         keep_every = 0;
-    bool        collect_output_weight = false;
-};
-
  class IMatrixCollector {
  public:
      IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(gpt_params params) { m_params = std::move(params); }
      bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
  private:
      std::unordered_map<std::string, Stats> m_stats;
-    StatParams                             m_params;
+    gpt_params                             m_params;
      std::mutex                             m_mutex;
      int                                    m_last_call = 0;
      std::vector<float>                     m_src1_data;
      std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
-                                                  //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
  };
  
  // remove any prefix and suffixes from the name
@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
          if (t->op != GGML_OP_MUL_MAT) return false;
          // why are small batches ignored (<16 tokens)?
          if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
          return true;
      }
  
@@ -158,16 +156,16 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
              }
              if (e.ncall > m_last_call) {
                  m_last_call = e.ncall;
-                if (m_last_call % m_params.n_output_frequency == 0) {
+                if (m_last_call % m_params.n_out_freq == 0) {
                      save_imatrix();
                  }
-                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                    keep_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                    save_imatrix(m_last_call);
                  }
              }
          }
      } else {
-        auto& e = m_stats[wname];
+        auto & e = m_stats[wname];
          if (e.values.empty()) {
              e.values.resize(src1->ne[0], 0);
              e.counts.resize(src1->ne[0], 0);
@@ -189,11 +187,11 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
          }
          if (e.ncall > m_last_call) {
              m_last_call = e.ncall;
-            if (m_last_call % m_params.n_output_frequency == 0) {
+            if (m_last_call % m_params.n_out_freq == 0) {
                  save_imatrix();
              }
-            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
              }
          }
      }
@@ -201,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
      return true;
  }
  
-void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
-}
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }
  
-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.ofile;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
-}
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
  
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
      std::ofstream out(fname, std::ios::binary);
      int n_entries = m_stats.size();
      out.write((const char *) &n_entries, sizeof(n_entries));
@@ -236,26 +232,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
      // Write the number of call the matrix was computed with
      out.write((const char *) &m_last_call, sizeof(m_last_call));
  
-    // Write the dataset name at the end of the file to later on specify it in quantize
-    int n_dataset = strlen(dataset);
-    out.write((const char *) &n_dataset, sizeof(n_dataset));
-    out.write(dataset, n_dataset);
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
+    }
  
      if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
      }
  }
  
-bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
-    std::ifstream in(imatrix_file, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * fname) {
+    std::ifstream in(fname, std::ios::binary);
      if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file);
+        printf("%s: failed to open %s\n",__func__, fname);
          return false;
      }
      int n_entries;
      in.read((char*)&n_entries, sizeof(n_entries));
      if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file);
+        printf("%s: no data in file %s\n", __func__, fname);
          return false;
      }
      for (int i = 0; i < n_entries; ++i) {
@@ -263,23 +261,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
          std::vector<char> name_as_vec(len+1);
          in.read((char *)name_as_vec.data(), len);
          if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
              return false;
          }
          name_as_vec[len] = 0;
          std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = m_stats[std::move(name)];
          int ncall;
          in.read((char*)&ncall, sizeof(ncall));
          int nval;
          in.read((char *)&nval, sizeof(nval));
          if (in.fail() || nval < 1) {
              printf("%s: failed reading number of values for entry %d\n",__func__,i);
-            imatrix_data = {};
+            m_stats = {};
              return false;
          }
  
-        // When re-called from load_imatrix() with add set, this will already be created.
          if (e.values.empty()) {
              e.values.resize(nval, 0);
              e.counts.resize(nval, 0);
@@ -289,7 +286,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
          in.read((char*)tmp.data(), nval*sizeof(float));
          if (in.fail()) {
              printf("%s: failed reading data for entry %d\n",__func__,i);
-            imatrix_data = {};
+            m_stats = {};
              return false;
          }
  
@@ -304,13 +301,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
      return true;
  }
  
-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
-    if (!add) {
-        m_stats.clear();
-    }
-    return load_imatrix(file_name, m_stats);
-}
-
  static IMatrixCollector g_collector;
  
  static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -324,7 +314,7 @@ struct results_log_softmax {
      float  prob;
  };
  
-static std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float> & logits) {
      std::vector<float> probs(logits.size());
      float max_logit = logits[0];
      for (float v : logits) {
@@ -358,8 +348,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
  
  static void process_logits(
      int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+    double & nll, double & nll2, float * logit_history, float * prob_history) {
      std::mutex mutex;
      int counter = 0;
      auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -391,8 +380,7 @@ static void process_logits(
      }
  }
  
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
-
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
      const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
      GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
      const int n_ctx = llama_n_ctx(ctx);
@@ -405,13 +393,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
      auto tim2 = std::chrono::high_resolution_clock::now();
      fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
  
-    if (from_chunk > 0) {
-        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+    if (params.i_chunk > 0) {
+        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
              return false;
          }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
-        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
      }
  
      if (int(tokens.size()) < 2*n_ctx) {
@@ -424,7 +412,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
      std::vector<float> logit_history;
      std::vector<float> prob_history;
  
-    if (compute_ppl) {
+    if (params.compute_ppl) {
          logit_history.resize(tokens.size());
          prob_history.resize(tokens.size());
      }
@@ -446,7 +434,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
      const int num_batches = (n_ctx + n_batch - 1) / n_batch;
  
      std::vector<float> logits;
-    if (compute_ppl && num_batches > 1) {
+    if (params.compute_ppl && num_batches > 1) {
          logits.reserve((size_t)n_ctx * n_vocab);
      }
  
@@ -482,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
              // restore the original token in case it was set to BOS
              tokens[batch_start] = token_org;
  
-            if (compute_ppl && num_batches > 1) {
+            if (params.compute_ppl && num_batches > 1) {
                  const auto * batch_logits = llama_get_logits(ctx);
                  logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
              }
@@ -501,7 +489,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
              fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
          }
  
-        if (compute_ppl) {
+        if (params.compute_ppl) {
              const int first = n_ctx/2;
              const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
              process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@@ -516,7 +504,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
      }
      printf("\n");
  
-    if (compute_ppl) {
+    if (params.compute_ppl) {
          nll2 /= count;
          nll /= count;
          const double ppl = exp(nll);
@@ -533,109 +521,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
  }
  
  int main(int argc, char ** argv) {
-    StatParams sparams;
-    std::string prev_result_file;
-    std::string combine_files;
-    bool compute_ppl = true;
-    int  from_chunk  = 0;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else if (arg == "--keep-imatrix") {
-            sparams.keep_every = std::stoi(argv[++iarg]);
-        } else if (arg == "--continue-from") {
-            prev_result_file = argv[++iarg];
-        } else if (arg == "--combine") {
-            combine_files = argv[++iarg];
-        }
-        else if (arg == "--from-chunk") {
-            from_chunk = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        std::string arg{argv[iarg]};
-        if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-
      gpt_params params;
-    params.n_batch = 512;
+
+    params.n_ctx = 512;
+    params.logits_all = true;
+    params.verbosity = 1;
  
      if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+        print_usage(argc, argv, params);
          return 1;
      }
  
-    params.logits_all = true;
      params.n_batch = std::min(params.n_batch, params.n_ctx);
  
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    sparams.dataset = params.prompt_file;
-    g_collector.set_parameters(std::move(sparams));
+    g_collector.set_params(params);
  
-    if (!combine_files.empty()) {
-        std::vector<std::string> files;
-        size_t pos = 0;
-        while (true) {
-            auto new_pos = combine_files.find(',', pos);
-            if (new_pos != std::string::npos) {
-                files.emplace_back(combine_files.substr(pos, new_pos - pos));
-                pos = new_pos + 1;
-            } else {
-                files.emplace_back(combine_files.substr(pos));
-                break;
-            }
-        }
-        if (files.size() < 2) {
-            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+    for (const auto & in_file : params.in_files) {
+        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        if (!g_collector.load_imatrix(in_file.c_str())) {
+            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
              return 1;
          }
-        printf("Combining the following %d files\n", int(files.size()));
-        for (auto& file : files) {
-            printf("    %s\n", file.c_str());
-            if (!g_collector.load_imatrix(file.c_str(), true)) {
-                fprintf(stderr, "Failed to load %s\n", file.c_str());
-                return 1;
-            }
-        }
-        g_collector.save_imatrix();
-        return 0;
      }
  
-    if (!prev_result_file.empty()) {
-        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
-            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
-            return 1;
-        }
+    if (params.in_files.size() > 1) {
+        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        g_collector.save_imatrix();
      }
  
      llama_backend_init();
@@ -650,6 +561,7 @@ int main(int argc, char ** argv) {
      // init
      llama_model * model;
      llama_context * ctx;
+
      std::tie(model, ctx) = llama_init_from_gpt_params(params);
      if (model == nullptr || ctx == nullptr) {
          fprintf(stderr, "%s : failed to init\n", __func__);
@@ -668,8 +580,7 @@ int main(int argc, char ** argv) {
          fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
      }
  
-    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
-    if (!OK) {
+    if (!compute_imatrix(ctx, params)) {
          return 1;
      }
  
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index d581cad95974d143d14fb1ce1fc884c44a1a0d66..74da81dadb49da92be0f3c5f467b7075955d5705 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2360,7 +2360,7 @@ int main(int argc, char ** argv) {
  
      // TODO: not great to use extern vars
      server_log_json = params.log_json;
-    server_verbose = params.verbose;
+    server_verbose = params.verbosity > 0;
  
      // struct that contains llama context and inference
      server_context ctx_server;
author	Georgi Gerganov <redacted>
	Thu, 6 Jun 2024 13:30:58 +0000 (16:30 +0300)
committer	GitHub <redacted>
	Thu, 6 Jun 2024 13:30:58 +0000 (16:30 +0300)
common/common.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
examples/imatrix/README.md		patch \| blob \| history
examples/imatrix/imatrix.cpp		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history