YAML result logging + preset script (#2657)

author Johannes Gäßler <redacted>

Mon, 28 Aug 2023 15:59:39 +0000 (17:59 +0200)

committer GitHub <redacted>

Mon, 28 Aug 2023 15:59:39 +0000 (17:59 +0200)
author Johannes Gäßler <redacted>
Mon, 28 Aug 2023 15:59:39 +0000 (17:59 +0200)
committer GitHub <redacted>
Mon, 28 Aug 2023 15:59:39 +0000 (17:59 +0200)
diff --git a/common/common.cpp b/common/common.cpp

index 0d91a6a35acaaaa99ef4562f08186e97cd31fd19..4a0d43c13ece9593a3900f2aa3ba9a4ef8484406 100644 (file)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,15 +1,20 @@
  #include "common.h"
+#include "build-info.h"
+#include "llama.h"
  
+#include <algorithm>
  #include <cassert>
-#include <iostream>
+#include <cmath>
  #include <cstring>
+#include <ctime>
  #include <fstream>
-#include <string>
  #include <iterator>
-#include <algorithm>
+#include <iostream>
+#include <regex>
  #include <sstream>
+#include <string>
  #include <unordered_set>
-#include <regex>
+#include <vector>
  
  #if defined(__APPLE__) && defined(__MACH__)
  #include <sys/types.h>
@@ -19,11 +24,14 @@
  #if defined(_WIN32)
  #define WIN32_LEAN_AND_MEAN
  #define NOMINMAX
+#include <codecvt>
+#include <locale>
  #include <windows.h>
  #include <fcntl.h>
  #include <io.h>
  #else
  #include <sys/ioctl.h>
+#include <sys/stat.h>
  #include <unistd.h>
  #endif
  
@@ -93,7 +101,6 @@ void process_escapes(std::string& input) {
  
  bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
      bool invalid_param = false;
-    bool escape_prompt = false;
      std::string arg;
      gpt_params default_params;
      const std::string arg_prefix = "--";
@@ -125,8 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                  break;
              }
              params.prompt = argv[i];
-        } else if (arg == "-e") {
-            escape_prompt = true;
+        } else if (arg == "-e" || arg == "--escape") {
+            params.escape = true;
          } else if (arg == "--prompt-cache") {
              if (++i >= argc) {
                  invalid_param = true;
@@ -415,6 +422,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                  break;
              }
              params.antiprompt.push_back(argv[i]);
+        } else if (arg == "-ld" || arg == "--logdir") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.logdir = argv[i];
+
+            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
+                params.logdir += DIRECTORY_SEPARATOR;
+            }
          } else if (arg == "--perplexity") {
              params.perplexity = true;
          } else if (arg == "--ppl-stride") {
@@ -520,7 +537,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
          exit(1);
      }
  
-    if (escape_prompt) {
+    if (params.escape) {
          process_escapes(params.prompt);
          process_escapes(params.input_prefix);
          process_escapes(params.input_suffix);
@@ -546,7 +563,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
      fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
      fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
      fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stdout, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stdout, "  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
      fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
      fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
      fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
@@ -627,6 +644,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
      fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
      fprintf(stdout, "  -m FNAME, --model FNAME\n");
      fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -ld LOGDIR, --logdir LOGDIR\n");
+    fprintf(stdout, "                        path under which to save YAML logs (no logging if unset)\n");
      fprintf(stdout, "\n");
  }
  
@@ -779,3 +798,289 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
  
      return result;
  }
+
+// Creates the directory `path` together with any missing parent directories.
+//
+// The path is processed front to back, one component at a time. A trailing
+// directory separator is optional: the last component is always created.
+// Components that already exist are accepted as long as they are directories.
+// An empty path is treated as "nothing to do".
+//
+// returns true if successful, false otherwise
+bool create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);
+
+    // if the path already exists, check whether it's a directory
+    const DWORD attributes = GetFileAttributesW(wpath.c_str());
+    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        return true;
+    }
+
+    // terminate the path with a separator so that the loop below also creates the last component
+    if (!wpath.empty() && wpath.back() != L'\\') {
+        wpath += L'\\';
+    }
+
+    size_t pos_slash = 0;
+
+    // process path from front to back, procedurally creating directories
+    // the search runs on the wide string: offsets into the UTF-8 `path` do not
+    // match offsets into `wpath` as soon as the path contains non-ASCII characters
+    while ((pos_slash = wpath.find(L'\\', pos_slash)) != std::wstring::npos) {
+        const std::wstring subpath = wpath.substr(0, pos_slash);
+        pos_slash += 1;
+
+        // nothing to create for a leading separator or a bare drive letter such as "C:"
+        if (subpath.empty() || (subpath.length() == 2 && subpath[1] == L':')) {
+            continue;
+        }
+
+        if (CreateDirectoryW(subpath.c_str(), NULL)) {
+            continue;
+        }
+
+        // creation failed: this is only acceptable if the path already exists ...
+        if (GetLastError() != ERROR_ALREADY_EXISTS) {
+            return false;
+        }
+
+        // ... and is a directory
+        const DWORD sub_attributes = GetFileAttributesW(subpath.c_str());
+        if (sub_attributes == INVALID_FILE_ATTRIBUTES || !(sub_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+            return false;
+        }
+    }
+
+    return true;
+#else
+    // if the path already exists, check whether it's a directory
+    struct stat info;
+    if (stat(path.c_str(), &info) == 0) {
+        return S_ISDIR(info.st_mode);
+    }
+
+    // terminate the path with a separator so that the loop below also creates the last component
+    std::string full_path = path;
+    if (!full_path.empty() && full_path.back() != '/') {
+        full_path += '/';
+    }
+
+    size_t pos_slash = 1; // skip leading slashes for directory creation
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = full_path.find('/', pos_slash)) != std::string::npos) {
+        const std::string subpath = full_path.substr(0, pos_slash);
+        pos_slash += 1;
+
+        // if the path already exists, ensure that it's a directory
+        if (stat(subpath.c_str(), &info) == 0) {
+            if (!S_ISDIR(info.st_mode)) {
+                return false;
+            }
+            continue;
+        }
+
+        // create the missing component
+        if (mkdir(subpath.c_str(), 0755) != 0) {
+            // another process may have created it between stat and mkdir;
+            // only fail if there still is no directory at this location
+            if (stat(subpath.c_str(), &info) != 0 || !S_ISDIR(info.st_mode)) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+#endif // _WIN32
+}
+
+// Writes `prop_name` followed by the values of `data` as a YAML flow sequence
+// (e.g. `name: [1.000000e+00, 2.000000e+00]`). An empty vector is written as a
+// key without a value.
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+
+    // every element except the last one is followed by a separator
+    const std::vector<float>::const_iterator last = data.end() - 1;
+    for (std::vector<float>::const_iterator it = data.begin(); it != last; ++it) {
+        fprintf(stream, "%e, ", *it);
+    }
+
+    // the last element closes the sequence
+    fprintf(stream, "%e]\n", *last);
+}
+
+// Writes `prop_name` followed by the values of `data` as a YAML flow sequence
+// (e.g. `name: [1, 2, 3]`). An empty vector is written as a key without a value.
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+    if (data.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    fprintf(stream, "%s: [", prop_name);
+
+    // every element except the last one is followed by a separator
+    const std::vector<int>::const_iterator last = data.end() - 1;
+    for (std::vector<int>::const_iterator it = data.begin(); it != last; ++it) {
+        fprintf(stream, "%d, ", *it);
+    }
+
+    // the last element closes the sequence
+    fprintf(stream, "%d]\n", *last);
+}
+
+// Writes `prop_name` and the string `data` to `stream` as a YAML key/value pair.
+//
+// The representation depends on the content:
+//   - NULL or empty string:               key without a value
+//   - leading or trailing whitespace:     double-quoted scalar with escapes, so
+//                                         that the whitespace is not lost
+//   - no newline:                         plain scalar on the same line
+//   - otherwise:                          literal block scalar (`|`), one
+//                                         indented output line per input line
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
+    const std::string data_str(data == NULL ? "" : data);
+
+    if (data_str.empty()) {
+        fprintf(stream, "%s:\n", prop_name);
+        return;
+    }
+
+    // std::isspace has undefined behavior for negative char values (e.g. UTF-8 bytes),
+    // so the characters are converted to unsigned char first
+    const bool leading_space  = std::isspace(static_cast<unsigned char>(data_str.front())) != 0;
+    const bool trailing_space = std::isspace(static_cast<unsigned char>(data_str.back()))  != 0;
+
+    if (leading_space || trailing_space) {
+        // the backslash is the escape character of double-quoted YAML scalars and
+        // must therefore be escaped itself, otherwise e.g. "\b" changes its meaning
+        std::string escaped;
+        escaped.reserve(data_str.size() + 2);
+        for (const char c : data_str) {
+            switch (c) {
+                case '\\': escaped += "\\\\"; break;
+                case '"':  escaped += "\\\""; break;
+                case '\n': escaped += "\\n";  break;
+                case '\r': escaped += "\\r";  break;
+                case '\t': escaped += "\\t";  break;
+                default:   escaped += c;      break;
+            }
+        }
+        fprintf(stream, "%s: \"%s\"\n", prop_name, escaped.c_str());
+        return;
+    }
+
+    if (data_str.find('\n') == std::string::npos) {
+        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+        return;
+    }
+
+    fprintf(stream, "%s: |\n", prop_name);
+
+    size_t pos_start = 0;
+    size_t pos_found = 0;
+    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+        fprintf(stream, "  %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+        pos_start = pos_found + 1;
+    }
+
+    // the data cannot end with a newline here (that case is handled by the quoted
+    // branch above), so there is always a final line after the last newline
+    fprintf(stream, "  %s\n", data_str.substr(pos_start).c_str());
+}
+
+// Returns the current local time as a string of the form
+// "YYYY_MM_DD-hh_mm_ss.nnnnnnnnn" (nnnnnnnnn = sub-second part in nanoseconds).
+// The fields are ordered from most to least significant and zero-padded, so
+// sorting the strings lexicographically sorts them chronologically.
+std::string get_sortable_timestamp() {
+    using clock = std::chrono::system_clock;
+
+    const clock::time_point current_time = clock::now();
+    const time_t as_time_t = clock::to_time_t(current_time);
+
+    // zero-initialized so that the buffer holds a valid (empty) string even if
+    // the time cannot be converted or formatted
+    char timestamp_no_ns[100] = "";
+    const std::tm * local_time = std::localtime(&as_time_t);
+    if (local_time != NULL) {
+        std::strftime(timestamp_no_ns, sizeof(timestamp_no_ns), "%Y_%m_%d-%H_%M_%S", local_time);
+    }
+
+    // convert to nanoseconds BEFORE taking the remainder: the tick period of
+    // system_clock is implementation-defined, a remainder taken in native ticks
+    // would span more than one second on clocks that do not tick in nanoseconds
+    const long long ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        current_time.time_since_epoch()).count() % 1000000000LL;
+
+    // long long + %lld is portable, %ld does not match a 64 bit value on every platform
+    char timestamp_ns[16];
+    snprintf(timestamp_ns, sizeof(timestamp_ns), "%09lld", ns);
+
+    return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+}
+
+// Writes everything that is known before any results exist to `stream` as YAML:
+// build information, CPU feature flags, model information and the user inputs
+// stored in `params`.
+//
+//   stream        - open file to write to
+//   params        - parsed command line parameters
+//   lctx          - context, queried for the vocabulary size and the EOS token
+//   timestamp     - written verbatim as the value of `time`
+//   prompt_tokens - written as `prompt_tokens`
+//   model_desc    - written verbatim as the value of `model_desc`
+//
+// Each key is written exactly once; repeated keys are not valid in a YAML mapping.
+void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
+                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
+    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
+    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+
+#ifdef NDEBUG
+    fprintf(stream, "debug: false\n");
+#else
+    fprintf(stream, "debug: true\n");
+#endif // NDEBUG
+
+    fprintf(stream, "model_desc: %s\n", model_desc);
+    fprintf(stream, "n_vocab: %d  # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
+
+#ifdef __OPTIMIZE__
+    fprintf(stream, "optimize: true\n");
+#else
+    fprintf(stream, "optimize: false\n");
+#endif // __OPTIMIZE__
+
+    fprintf(stream, "time: %s\n", timestamp.c_str());
+
+    fprintf(stream, "\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "# User Inputs #\n");
+    fprintf(stream, "###############\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+    dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
+    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+    fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
+    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
+    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
+    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+    // the explicit cast keeps the argument in sync with %ld, whatever integer type the field is declared as
+    fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", static_cast<long>(params.hellaswag_tasks));
+
+    // ignoring EOS is implemented as a logit bias of -infinity on the EOS token,
+    // it is logged as `ignore_eos` and left out of `logit_bias` below
+    const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
+    const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+
+    dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
+    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+    dump_string_yaml_multiline(stream, "in_suffix", params.input_suffix.c_str());
+    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
+    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+    // one `token: bias` entry per line, indented below the key
+    fprintf(stream, "logit_bias:\n");
+    for (const auto & lb : params.logit_bias) {
+        if (ignore_eos && lb.first == logit_bias_eos->first) {
+            continue;
+        }
+        fprintf(stream, "  %d: %f\n", lb.first, lb.second);
+    }
+
+    fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
+    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
+    fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
+    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
+    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+    fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
+    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
+    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
+    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
+    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
+    dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
+    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+    dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
+    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
+
+    fprintf(stream, "reverse_prompt:\n");
+    for (std::string ap : params.antiprompt) { // copy on purpose, the string is modified below
+        // keep every list item on a single line by writing newlines as "\n"
+        size_t pos = 0;
+        while ((pos = ap.find('\n', pos)) != std::string::npos) {
+            ap.replace(pos, 1, "\\n");
+            pos += 1;
+        }
+
+        fprintf(stream, "  - %s\n", ap.c_str());
+    }
+
+    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
+
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
+    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
+
+    fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
+    // hardware_concurrency() returns an unsigned value, hence %u
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
+    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+}
diff --git a/common/common.h b/common/common.h

index 97fda2be78b5188d717dbff1b3defc2eb8a9ab85..c15373144bc2a570c4b3d67c0f8427f063378a7f 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -11,6 +11,12 @@
  #include <unordered_map>
  #include <tuple>
  
+#ifdef _WIN32
+#define DIRECTORY_SEPARATOR '\\'
+#else
+#define DIRECTORY_SEPARATOR '/'
+#endif // _WIN32
+
  //
  // CLI argument parsing
  //
@@ -61,6 +67,7 @@ struct gpt_params {
      std::string input_suffix      = "";  // string to suffix user inputs with
      std::string grammar           = "";  // optional BNF-like grammar to constrain sampling
      std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
+    std::string logdir            = "";  // directory in which to save YAML log files
  
      std::string lora_adapter = "";  // lora adapter path
      std::string lora_base    = "";  // base model path for the lora adapter
@@ -82,6 +89,7 @@ struct gpt_params {
      bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
  
      bool embedding         = false; // get only sentence embedding
+    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
      bool interactive_first = false; // wait for user input immediately
      bool multiline_input   = false; // reverse the usage of `\`
      bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
@@ -144,3 +152,13 @@ std::string llama_detokenize_spm(
  std::string llama_detokenize_bpe(
                           llama_context * ctx,
          const std::vector<llama_token> & tokens);
+
+bool create_directory_with_parents(const std::string & path);
+void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
+void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
+std::string get_sortable_timestamp();
+
+void dump_non_result_info_yaml(
+    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 3ce57f436b89340f19aad550b1c94ebaa08cde70..89cc4f6023d1c6c4f7c3308be6566049866b6e56 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -17,6 +17,7 @@
  #include <ctime>
  #include <fstream>
  #include <iostream>
+#include <sstream>
  #include <string>
  #include <vector>
  
@@ -36,9 +37,57 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
  
-static llama_context ** g_ctx;
+static llama_context           ** g_ctx;
+static llama_model             ** g_model;
+static gpt_params               * g_params;
+static std::vector<llama_token> * g_input_tokens;
+static std::ostringstream       * g_output_ss;
+static std::vector<llama_token> * g_output_tokens;
  static bool is_interacting = false;
  
+// Writes a YAML log file for the current run to `params.logdir`.
+//
+// The file is named after the current timestamp and contains the build and
+// parameter information, followed by the generated text, the generated tokens
+// and the timing information. Does nothing if no log directory is set. Failing
+// to create the directory or to open the file is reported on stderr and is not
+// fatal.
+//
+// The token vectors and the output string are taken by const reference so that
+// they are not copied for every call.
+void write_logfile(
+    const llama_context * ctx, const gpt_params & params, const llama_model * model,
+    const std::vector<llama_token> & input_tokens, const std::string & output, const std::vector<llama_token> & output_tokens) {
+
+    if (params.logdir.empty()) {
+        return;
+    }
+
+    const std::string timestamp = get_sortable_timestamp();
+
+    const bool success = create_directory_with_parents(params.logdir);
+    if (!success) {
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
+        return;
+    }
+
+    // the file name is logdir and timestamp joined without a separator in between
+    const std::string logfile_path = params.logdir + timestamp + ".yml";
+    FILE * logfile = fopen(logfile_path.c_str(), "w");
+
+    if (logfile == NULL) {
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        return;
+    }
+
+    fprintf(logfile, "binary: main\n");
+    char model_desc[128];
+    llama_model_desc(model, model_desc, sizeof(model_desc));
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+
+    fprintf(logfile, "\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "# Generation Results #\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "\n");
+
+    dump_string_yaml_multiline(logfile, "output", output.c_str());
+    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+
+    llama_dump_timing_info_yaml(logfile, ctx);
+    fclose(logfile);
+}
+
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
  void sigint_handler(int signo) {
      if (signo == SIGINT) {
@@ -48,6 +97,7 @@ void sigint_handler(int signo) {
              console::cleanup();
              printf("\n");
              llama_print_timings(*g_ctx);
+            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
              _exit(130);
          }
      }
@@ -56,6 +106,7 @@ void sigint_handler(int signo) {
  
  int main(int argc, char ** argv) {
      gpt_params params;
+    g_params = &params;
  
      if (gpt_params_parse(argc, argv, params) == false) {
          return 1;
@@ -116,6 +167,7 @@ int main(int argc, char ** argv) {
      llama_model * model;
      llama_context * ctx;
      llama_context * ctx_guidance = NULL;
+    g_model = &model;
      g_ctx = &ctx;
  
      // load the model and apply lora adapter, if any
@@ -397,6 +449,10 @@ int main(int argc, char ** argv) {
      int n_session_consumed = 0;
      int n_past_guidance    = 0;
  
+    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
+    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
+    std::ostringstream output_ss;     g_output_ss     = &output_ss;
+
      // the first thing we will do is to output the prompt, so set color accordingly
      console::set_display(console::prompt);
  
@@ -667,7 +723,15 @@ int main(int argc, char ** argv) {
          // display text
          if (input_echo) {
              for (auto id : embd) {
-                printf("%s", llama_token_to_piece(ctx, id).c_str());
+                const std::string token_str = llama_token_to_piece(ctx, id);
+                printf("%s", token_str.c_str());
+
+                if (embd.size() > 1) {
+                    input_tokens.push_back(id);
+                } else {
+                    output_tokens.push_back(id);
+                    output_ss << token_str;
+                }
              }
              fflush(stdout);
          }
@@ -761,6 +825,8 @@ int main(int argc, char ** argv) {
                          printf("%s", params.input_suffix.c_str());
                      }
  
+                    const size_t original_size = embd_inp.size();
+
                      // instruct mode: insert instruction prefix
                      if (params.instruct && !is_antiprompt) {
                          n_consumed = embd_inp.size();
@@ -775,6 +841,12 @@ int main(int argc, char ** argv) {
                          embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                      }
  
+                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
+                        const llama_token token = embd_inp[i];
+                        output_tokens.push_back(token);
+                        output_ss << llama_token_to_piece(ctx, token);
+                    }
+
                      n_remain -= line_inp.size();
                  }
  
@@ -817,6 +889,8 @@ int main(int argc, char ** argv) {
      }
  
      llama_print_timings(ctx);
+    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+
      if (ctx_guidance) { llama_free(ctx_guidance); }
      llama_free(ctx);
      llama_free_model(model);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp

index ebafa0c29f54003e539a8778450bd47f2588ffb9..aeb774c5fa496037ea14a394b6587781e1b53864 100644 (file)
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -3,16 +3,79 @@
  #include "build-info.h"
  
  #include <cmath>
+#include <cstdio>
+#include <cstring>
  #include <ctime>
  #include <sstream>
-#include <cstring>
  #include <thread>
  #include <mutex>
+#include <vector>
  
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
  
+// Results of a perplexity run in the form in which write_logfile() consumes them.
+struct results_perplexity {
+    std::vector<llama_token> tokens;    // passed to the log file as the prompt tokens
+    double                   ppl_value; // logged as `ppl_value`
+    std::vector<float>       logits;    // logged as `logits`
+    std::vector<float>       probs;     // logged as `probs`
+};
+
+// Return value of log_softmax(): the log-softmax of one token together with
+// the raw logit and the softmax probability of that same token.
+struct results_log_softmax {
+    double log_softmax; // logit - max_logit - log(sum of exp(logit_i - max_logit))
+    float  logit;       // raw logit of the token
+    float  prob;        // exp(logit - max_logit) / sum of exp(logit_i - max_logit)
+};
+
+// Writes a YAML log file with the results of a perplexity run to `params.logdir`.
+//
+// The file is named after the current timestamp and contains the build and
+// parameter information, followed by the logits, the perplexity value, the
+// probabilities and the timing information. Does nothing if no log directory
+// is set. HellaSwag runs are not supported and only produce a warning. Failing
+// to create the directory or to open the file is reported on stderr and is not
+// fatal.
+void write_logfile(const llama_context * ctx, const gpt_params & params,
+                   const llama_model * model, const struct results_perplexity & results) {
+
+    if (params.logdir.empty()) {
+        return;
+    }
+
+    if (params.hellaswag) {
+        fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+        return;
+    }
+
+    const std::string timestamp = get_sortable_timestamp();
+
+    const bool success = create_directory_with_parents(params.logdir);
+    if (!success) {
+        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+                __func__, params.logdir.c_str());
+        return;
+    }
+
+    // the file name is logdir and timestamp joined without a separator in between
+    const std::string logfile_path = params.logdir + timestamp + ".yml";
+    FILE * logfile = fopen(logfile_path.c_str(), "w");
+
+    if (logfile == NULL) {
+        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+        return;
+    }
+
+    // name the binary that produced the log, this file belongs to the perplexity example
+    fprintf(logfile, "binary: perplexity\n");
+    char model_desc[128];
+    llama_model_desc(model, model_desc, sizeof(model_desc));
+    dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
+
+    fprintf(logfile, "\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "# Perplexity Results #\n");
+    fprintf(logfile, "######################\n");
+    fprintf(logfile, "\n");
+
+    dump_vector_float_yaml(logfile, "logits", results.logits);
+    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
+    dump_vector_float_yaml(logfile, "probs", results.probs);
+
+    llama_dump_timing_info_yaml(logfile, ctx);
+    fclose(logfile);
+}
+
  std::vector<float> softmax(const std::vector<float>& logits) {
      std::vector<float> probs(logits.size());
      float max_logit = logits[0];
@@ -29,20 +92,20 @@ std::vector<float> softmax(const std::vector<float>& logits) {
      return probs;
  }
  
-float log_softmax(int n_vocab, const float * logits, int tok) {
+// Computes the log-softmax of token `tok` over the `n_vocab` values in `logits`.
+// The maximum logit is subtracted before exponentiating. Besides the
+// log-softmax, the raw logit and the softmax probability of `tok` are returned.
+results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
      float max_logit = logits[0];
      for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
      double sum_exp = 0.0;
      for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
-    return logits[tok] - max_logit - log(sum_exp);
+    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
  }
  
-void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
-        double& nll, double& nll2) {
+void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
+        double & nll, double & nll2, float * logit_history, float * prob_history) {
  
      std::mutex mutex;
      int counter = 0;
-    auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
+    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
          double local_nll = 0, local_nll2 = 0;
          while (true) {
              std::unique_lock<std::mutex> lock(mutex);
@@ -52,34 +115,43 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n
                  break;
              }
              lock.unlock();
-            double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const double v = -results.log_softmax;
              local_nll += v;
              local_nll2 += v*v;
+
+            logit_history[i] = results.logit;
+            prob_history[i]  = results.prob;
          }
      };
-    for (auto& w : workers) w = std::thread(compute);
+    for (auto & w : workers) w = std::thread(compute);
      compute();
-    for (auto& w : workers) w.join();
+    for (auto & w : workers) w.join();
  
  }
  
-void perplexity_v2(llama_context * ctx, const gpt_params & params) {
+results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
      // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
      // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
      // Output: `perplexity: 13.5106 [114/114]`
      // BOS tokens will be added for each chunk before eval
  
-    if (params.ppl_stride <= 0) {
-        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
-        return;
-    }
-
      const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
      const bool add_bos = is_spm;
  
      fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
  
-    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<float>       logit_history;
+    std::vector<float>       prob_history;
+
+    logit_history.resize(tokens.size());
+    prob_history.resize(tokens.size());
+
+    if (params.ppl_stride <= 0) {
+        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        return {tokens, -1, logit_history, prob_history};
+    }
  
      const int calc_chunk = params.n_ctx;
  
@@ -88,7 +160,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
      if (int(tokens.size()) <= calc_chunk) {
          fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
                  tokens.size(), params.n_ctx, params.ppl_stride);
-        return;
+        return {tokens, -1, logit_history, prob_history};
      }
  
      const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1)  / params.ppl_stride;
@@ -120,7 +192,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
              //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
              if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
                  //fprintf(stderr, "%s : failed to eval\n", __func__);
-                return;
+                return {tokens, -1, logit_history, prob_history};
              }
  
              // save original token and restore it after eval
@@ -161,6 +233,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
                  logits.begin() + (j + 1) * n_vocab);
  
              const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+            logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
+            prob_history[start + j + 1]  = prob;
  
              nll += -std::log(prob);
              ++count;
@@ -174,12 +248,14 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
          fflush(stdout);
      }
      printf("\n");
+
+    return {tokens, std::exp(nll / count), logit_history, prob_history};
  }
  
-void perplexity(llama_context * ctx, const gpt_params & params) {
+results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
+
      if (params.ppl_stride > 0) {
-        perplexity_v2(ctx, params);
-        return;
+        return perplexity_v2(ctx, params);
      }
  
      // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
@@ -193,11 +269,17 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
      auto tim1 = std::chrono::high_resolution_clock::now();
      fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
  
-    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
  
      auto tim2 = std::chrono::high_resolution_clock::now();
      fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
  
+    std::vector<float> logit_history;
+    logit_history.resize(tokens.size());
+
+    std::vector<float> prob_history;
+    prob_history.resize(tokens.size());
+
      const int n_chunk_max = tokens.size() / params.n_ctx;
  
      const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
@@ -236,7 +318,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
  
              if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
                  fprintf(stderr, "%s : failed to eval\n", __func__);
-                return;
+                return {tokens, -1, logit_history, prob_history};
              }
  
              // restore the original token in case it was set to BOS
@@ -272,7 +354,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
          // last 256 tokens.  Then, we split the input up into context window size chunks to
          // process the entire prompt.
          const int first = std::min(512, params.n_ctx/2);
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
+        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
+                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
          count += params.n_ctx - first - 1;
  
          // perplexity is e^(average negative log-likelihood)
@@ -287,16 +370,19 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
          fflush(stdout);
      }
      printf("\n");
+
      nll2 /= count;
      nll /= count;
+    const double ppl = exp(nll);
      nll2 -= nll * nll;
      if (nll2 > 0) {
          nll2 = sqrt(nll2/(count-1));
-        double ppl = exp(nll);
          printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
      } else {
          printf("Unexpected negative standard deviation of log(prob)\n");
      }
+
+    return {tokens, ppl, logit_history, prob_history};
  }
  
  std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
@@ -604,13 +690,16 @@ int main(int argc, char ** argv) {
                  params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
      }
  
+    struct results_perplexity results;
      if (params.hellaswag) {
          hellaswag_score(ctx, params);
      } else {
-        perplexity(ctx, params);
+        results = perplexity(ctx, params);
      }
  
      llama_print_timings(ctx);
+    write_logfile(ctx, params, model, results);
+
      llama_free(ctx);
      llama_free_model(model);
  
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 89a3311f5432977639b90af608bdb58a4e287bf6..b485a5eada9f4bec9ce3a0f08c0c575b24d6a322 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -719,7 +719,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
      fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
      fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
      fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
      fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
      fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
      fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
diff --git a/llama.cpp b/llama.cpp

index da8ff64d0a0c0f3fc397abe44dff5625c75a96da..11697ee65c2a2f15e25bdcc48ea833d14fbdfdd4 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -6247,6 +6247,35 @@ const char * llama_print_system_info(void) {
      return s.c_str();
  }
  
+void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
+
+    fprintf(stream, "\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "# Timings #\n");
+    fprintf(stream, "###########\n");
+    fprintf(stream, "\n");
+
+    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
+            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
+    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
+            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
+    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
+            1.0e-3 * ctx->t_sample_us / ctx->n_sample);
+    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
+    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
+    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->n_sample);
+    fprintf(stream, "t_eval_us: %ld  # total microseconds spent generating tokens\n", ctx->t_eval_us);
+    fprintf(stream, "t_load_us: %ld  # total microseconds spent loading the model\n", ctx->t_load_us);
+    fprintf(stream, "t_p_eval_us: %ld  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
+    fprintf(stream, "t_sample_us: %ld  # total microseconds spent sampling\n", ctx->t_sample_us);
+    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
+            1.0e6 * ctx->n_eval / ctx->t_eval_us);
+    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
+            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
+    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
+            1.0e6 * ctx->n_sample / ctx->t_sample_us);
+}
+
  // For internal test use
  const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
      return ctx->model.tensors_by_name;
diff --git a/llama.h b/llama.h

index 7bb681d612cc5a3bf641a45012828feee3f0d3da..b38d3be206115eb7c46db3f0885f0f5e6a90f52e 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -10,6 +10,7 @@
  #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
+#include <stdio.h>
  #include <stdbool.h>
  
  #ifdef LLAMA_SHARED
@@ -520,6 +521,8 @@ extern "C" {
      // If this is not called, or NULL is supplied, everything is output on stderr.
      LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
  
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/run_with_preset.py b/run_with_preset.py

new file mode 100755 (executable)

index 0000000..8f90f52
--- /dev/null
+++ b/run_with_preset.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import subprocess
+import sys
+
+import yaml
+
+CLI_ARGS_MAIN_PERPLEXITY = [
+    "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
+    "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
+    "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
+    "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
+    "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
+    "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
+    "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
+    "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
+    "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
+    "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
+    "verbose-prompt"
+]
+
+CLI_ARGS_LLAMA_BENCH = [
+    "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
+    "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
+]
+
+CLI_ARGS_SERVER = [
+    "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
+    "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
+    "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
+    "threads", "verbose"
+]
+
+description = """Run llama.cpp binaries with presets from YAML file(s).
+To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
+To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
+
+Formatting considerations:
+- The YAML property names are the same as the CLI argument names of the corresponding binary.
+- Properties must use the long name of their corresponding llama.cpp CLI arguments.
+- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores.
+- Flags must be defined as "<PROPERTY_NAME>: true" to be effective.
+- To define the logit_bias property, the expected format is "<TOKEN_ID>: <BIAS>" in the "logit_bias" namespace.
+- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
+- To define a tensor split, pass a list of floats.
+"""
+usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
+epilog = ("  --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
+          "Unknown args will be ignored.")
+
+parser = argparse.ArgumentParser(
+    description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument("-bin", "--binary", help="The binary to run.")
+parser.add_argument("yaml_files", nargs="*",
+                    help="Arbitrary number of YAML files from which to read preset values. "
+                    "If two files specify the same values the later one will be used.")
+
+known_args, unknown_args = parser.parse_known_args()
+
+if not known_args.yaml_files and not unknown_args:
+    parser.print_help()
+    sys.exit(0)
+
+props = dict()
+
+for yaml_file in known_args.yaml_files:
+    with open(yaml_file, "r") as f:
+        props.update(yaml.load(f, yaml.SafeLoader))
+
+props = {prop.replace("_", "-"): val for prop, val in props.items()}
+
+binary = props.pop("binary", "main")
+if known_args.binary:
+    binary = known_args.binary
+
+if os.path.exists(f"./{binary}"):
+    binary = f"./{binary}"
+
+if binary.lower().endswith("main") or binary.lower().endswith("perplexity"):
+    cli_args = CLI_ARGS_MAIN_PERPLEXITY
+elif binary.lower().endswith("llama-bench"):
+    cli_args = CLI_ARGS_LLAMA_BENCH
+elif binary.lower().endswith("server"):
+    cli_args = CLI_ARGS_SERVER
+else:
+    print(f"Unknown binary: {binary}")
+    sys.exit(1)
+
+command_list = [binary]
+
+for cli_arg in cli_args:
+    value = props.pop(cli_arg, None)
+
+    if not value or value == -1:
+        continue
+
+    if cli_arg == "logit-bias":
+        for token, bias in value.items():
+            command_list.append("--logit-bias")
+            command_list.append(f"{token}{bias:+}")
+        continue
+
+    if cli_arg == "reverse-prompt" and not isinstance(value, str):
+        for rp in value:
+            command_list.append("--reverse-prompt")
+            command_list.append(str(rp))
+        continue
+
+    command_list.append(f"--{cli_arg}")
+
+    if cli_arg == "tensor-split":
+        command_list.append(",".join([str(v) for v in value]))
+        continue
+
+    value = str(value)
+
+    if value != "True":
+        command_list.append(str(value))
+
+num_unused = len(props)
+if num_unused > 10:
+    print(f"The preset file contained a total of {num_unused} unused properties.")
+elif num_unused > 0:
+    print("The preset file contained the following unused properties:")
+    for prop, value in props.items():
+        print(f"  {prop}: {value}")
+
+command_list += unknown_args
+
+sp = subprocess.Popen(command_list)
+
+while sp.returncode is None:
+    try:
+        sp.wait()
+    except KeyboardInterrupt:
+        pass
+
+sys.exit(sp.returncode)
author	Johannes Gäßler <redacted>
	Mon, 28 Aug 2023 15:59:39 +0000 (17:59 +0200)
committer	GitHub <redacted>
	Mon, 28 Aug 2023 15:59:39 +0000 (17:59 +0200)
common/common.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
examples/main/main.cpp		patch \| blob \| history
examples/perplexity/perplexity.cpp		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history
run_with_preset.py	[new file with mode: 0755]	patch \| blob