BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
jobs:
macOS-latest-cmake-arm64:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+env:
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_LOG_VERBOSITY: 10
+
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
tests/test-grammar-parser \
tests/test-json-schema-to-grammar \
tests/test-llama-grammar \
+ tests/test-log \
tests/test-model-load-cancel \
tests/test-opt \
tests/test-quantize-fns \
DEPRECATE_WARNING := 1
endif
+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
+
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
MK_LDFLAGS += -fsanitize=undefined -g
endif
-ifdef LLAMA_SERVER_VERBOSE
- MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
-endif
-
ifdef LLAMA_SERVER_SSL
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
MK_LDFLAGS += -lssl -lcrypto
endif
-ifdef LLAMA_DISABLE_LOGS
- MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
-
# warnings
WARN_FLAGS = \
-Wall \
OBJ_COMMON = \
common/common.o \
common/arg.o \
+ common/log.o \
common/console.o \
common/ngram-cache.o \
common/sampling.o \
$(info )
endif
+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info )
+endif
+
#
# Build libraries
#
common/arg.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+common/log.o: \
+ common/log.cpp \
+ common/log.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
common/sampling.o: \
common/sampling.cpp \
common/sampling.h \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
- $(OBJ_GGML) $(OBJ_LLAMA)
+ $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+tests/test-log: tests/test-log.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
tests/test-grammar-parser: tests/test-grammar-parser.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
## main
+export LLAMA_LOG_PREFIX=1
+export LLAMA_LOG_TIMESTAMPS=1
+
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt
set(TARGET common)
add_library(${TARGET} STATIC
+ arg.cpp
+ arg.h
base64.hpp
- common.h
common.cpp
- arg.h
- arg.cpp
- sampling.h
- sampling.cpp
- console.h
+ common.h
console.cpp
- json.hpp
+ console.h
json-schema-to-grammar.cpp
- train.h
- train.cpp
- ngram-cache.h
+ json.hpp
+ log.cpp
+ log.h
ngram-cache.cpp
+ ngram-cache.h
+ sampling.cpp
+ sampling.h
+ train.cpp
+ train.h
)
if (BUILD_SHARED_LIBS)
#include "arg.h"
+#include "log.h"
#include "sampling.h"
#include <algorithm>
-#include <string>
-#include <vector>
-#include <set>
+#include <climits>
+#include <cstdarg>
#include <fstream>
#include <regex>
-#include <cstdarg>
-#include <climits>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
#include "json-schema-to-grammar.h"
exit(0);
}
));
- add_opt(llama_arg(
- {"-v", "--verbose"},
- "print verbose information",
- [](gpt_params & params) {
- params.verbosity = 1;
- }
- ));
- add_opt(llama_arg(
- {"--verbosity"}, "N",
- format("set specific verbosity level (default: %d)", params.verbosity),
- [](gpt_params & params, int value) {
- params.verbosity = value;
- }
- ));
add_opt(llama_arg(
{"--verbose-prompt"},
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
[](gpt_params & params) {
params.use_color = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(llama_arg(
{"-t", "--threads"}, "N",
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
params.input_prefix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
add_opt(llama_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
params.input_suffix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
add_opt(llama_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
params.system_prompt = system_prompt;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
- {"--log-format"}, "{text, json}",
- "log output format: json or text (default: json)",
- [](gpt_params & params, const std::string & value) {
- if (value == "json") {
- params.log_json = true;
- } else if (value == "text") {
- params.log_json = false;
- } else {
- throw std::invalid_argument("invalid value");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"--metrics"},
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_BENCH}));
-#ifndef LOG_DISABLE_LOGS
- // TODO: make this looks less weird
- add_opt(llama_arg(
- {"--log-test"},
- "Log test",
- [](gpt_params &) { log_param_single_parse("--log-test"); }
- ));
add_opt(llama_arg(
{"--log-disable"},
"Log disable",
- [](gpt_params &) { log_param_single_parse("--log-disable"); }
+ [](gpt_params &) {
+ gpt_log_pause(gpt_log_main());
+ }
));
add_opt(llama_arg(
- {"--log-enable"},
- "Log enable",
- [](gpt_params &) { log_param_single_parse("--log-enable"); }
+ {"--log-file"}, "FNAME",
+ "Log to file",
+ [](gpt_params &, const std::string & value) {
+ gpt_log_set_file(gpt_log_main(), value.c_str());
+ }
));
add_opt(llama_arg(
- {"--log-new"},
- "Log new",
- [](gpt_params &) { log_param_single_parse("--log-new"); }
- ));
+ {"--log-colors"},
+ "Enable colored logging",
+ [](gpt_params &) {
+ gpt_log_set_colors(gpt_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_COLORS"));
add_opt(llama_arg(
- {"--log-append"},
- "Log append",
- [](gpt_params &) { log_param_single_parse("--log-append"); }
+ {"-v", "--verbose", "--log-verbose"},
+ "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
+ [](gpt_params & params) {
+ params.verbosity = INT_MAX;
+ gpt_log_set_verbosity_thold(INT_MAX);
+ }
));
add_opt(llama_arg(
- {"--log-file"}, "FNAME",
- "Log file",
- [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
- ));
-#endif // LOG_DISABLE_LOGS
+ {"-lv", "--verbosity", "--log-verbosity"}, "N",
+ "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+ [](gpt_params & params, int value) {
+ params.verbosity = value;
+ gpt_log_set_verbosity_thold(value);
+ }
+ ).set_env("LLAMA_LOG_VERBOSITY"));
+ add_opt(llama_arg(
+ {"--log-prefix"},
+        "Enable prefix in log messages",
+ [](gpt_params &) {
+ gpt_log_set_prefix(gpt_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_PREFIX"));
+ add_opt(llama_arg(
+ {"--log-timestamps"},
+ "Enable timestamps in log messages",
+ [](gpt_params &) {
+ gpt_log_set_timestamps(gpt_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_TIMESTAMPS"));
return ctx_arg;
}
#endif
#include "common.h"
+#include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include <unordered_map>
#include <unordered_set>
#include <vector>
+#include <thread>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
-#include <thread>
#include <future>
#endif
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
- fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+ LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
}
if (!setpriority(PRIO_PROCESS, 0, p)) {
- fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+ LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
return true;
if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues.
- fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+ LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
}
}
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) {
- fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+ LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false;
}
} else {
start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) {
- fprintf(stderr, "Start index out of bounds!\n");
+ LOG_ERR("Start index out of bounds!\n");
return false;
}
}
} else {
end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) {
- fprintf(stderr, "End index out of bounds!\n");
+ LOG_ERR("End index out of bounds!\n");
return false;
}
}
} else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10;
} else {
- fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+ LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false;
}
return true;
}
+void gpt_init() {
+ llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+ if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
+ gpt_log_add(gpt_log_main(), level, "%s", text);
+ }
+ }, NULL);
+
+#ifdef NDEBUG
+ const char * build_type = "";
+#else
+ const char * build_type = " (debug)";
+#endif
+
+ LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+}
+
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
s = std::move(builder);
}
+std::string string_from(bool value) {
+ return value ? "true" : "false";
+}
+
+std::string string_from(const std::vector<int> & values) {
+ std::stringstream buf;
+
+ buf << "[ ";
+ bool first = true;
+ for (auto e : values) {
+ if (first) {
+ first = false;
+ } else {
+ buf << ", ";
+ }
+ buf << std::to_string(e);
+ }
+ buf << " ]";
+
+ return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+ std::stringstream buf;
+
+ buf << "[ ";
+
+ bool first = true;
+ for (const auto & token : tokens) {
+ if (!first) {
+ buf << ", ";
+ } else {
+ first = false;
+ }
+
+ auto detokenized = llama_token_to_piece(ctx, token);
+
+ detokenized.erase(
+ std::remove_if(
+ detokenized.begin(),
+ detokenized.end(),
+ [](const unsigned char c) { return !std::isprint(c); }),
+ detokenized.end());
+
+ buf << "'" << detokenized << "'"
+ << ":" << std::to_string(token);
+ }
+
+ buf << " ]";
+
+ return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
+ std::stringstream buf;
+
+ buf << "[ ";
+
+ bool first = true;
+ for (int i = 0; i < batch.n_tokens; ++i) {
+ if (!first) {
+ buf << ", ";
+ } else {
+ first = false;
+ }
+
+ auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+
+ detokenized.erase(
+ std::remove_if(
+ detokenized.begin(),
+ detokenized.end(),
+ [](const unsigned char c) { return !std::isprint(c); }),
+ detokenized.end());
+
+ buf << "\n" << std::to_string(i)
+ << ":token '" << detokenized << "'"
+ << ":pos " << std::to_string(batch.pos[i])
+ << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
+ << ":seq_id " << std::to_string(batch.seq_id[i][0])
+ << ":logits " << std::to_string(batch.logits[i]);
+ }
+
+ buf << " ]";
+
+ return buf.str();
+}
+
void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char * sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
- fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
} else if (std::strcmp(sep, "false") == 0) {
kvo.val_bool = false;
} else {
- fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else if (strncmp(sep, "str:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
if (strlen(sep) > 127) {
- fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+ LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
return false;
}
strncpy(kvo.val_str, sep, 127);
kvo.val_str[127] = '\0';
} else {
- fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
}
if (model == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
}
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return iparams;
}
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
- fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+ LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
return iparams;
}
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
- fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+ LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sparams.ignore_eos = false;
}
if (params.warmup) {
- LOG("warming up the model with an empty run\n");
+ LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
- fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+ LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
- fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+ LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
- fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+ LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
return false;
}
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
if (!curl) {
- fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+ LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
}
if (metadata_in.good()) {
try {
metadata_in >> metadata;
- fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+ LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
- fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+ LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
- fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+ LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
}
}
} else {
- fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
+ LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
force_download = true;
- fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+ LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
bool should_download = !file_exists || force_download;
if (!should_download) {
if (!etag.empty() && etag != headers.etag) {
- fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+ LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
- fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+ LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
}
}
if (should_download) {
std::string path_temporary = path + ".downloadInProgress";
if (file_exists) {
- fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+ LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
- fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
+ LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false;
}
}
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
- fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
+ LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;
}
};
// start the download
- fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+ LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
long http_code = 0;
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
- fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
+ LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return false;
}
{"lastModified", headers.last_modified}
});
std::ofstream(metadata_path) << metadata.dump(4);
- fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+ LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
- fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+ LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
}
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
- fprintf(stderr, "%s: invalid model_url\n", __func__);
+ LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
if (!ctx_gguf) {
- fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
return NULL;
}
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
- fprintf(stderr, "\n%s: unexpected model file name: %s"
- " n_split=%d\n", __func__, path_model, n_split);
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
- fprintf(stderr, "\n%s: unexpected model url: %s"
- " n_split=%d\n", __func__, model_url, n_split);
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
return NULL;
}
}
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
- fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
- fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
- fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
- fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+ LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
}
for (int i = 0; i < n_tensors; i++) {
}
}
if (layer_idx < 0) {
- fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
- fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
- fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
- fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
- fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
}
if (result.n_embd == -1) {
- fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+ LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}
break;
}
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
- fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+ LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1;
break;
}
}
if (result.n_embd == -1) {
- fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+ LOG_ERR("%s: no valid control vector files passed\n", __func__);
result.data.clear();
}
#include "llama.h"
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
#include <string>
#include <vector>
+#include <sstream>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
bool batched_bench_output_jsonl = false;
};
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();
+
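A minimal usage sketch, assuming a hypothetical example program that links libcommon: gpt_init() is called once at startup so that llama.cpp's internal logging is routed through the new gpt_log facility, after which the LOG_* macros from log.h can be used directly.

    #include "common.h"
    #include "log.h"

    int main() {
        gpt_init(); // installs the llama.cpp log callback and prints build info
        LOG_INF("common library initialized\n");
        return 0;
    }
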
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
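A hedged sketch of how these helpers combine with the new logging macros; debug_dump_tokens() is a hypothetical wrapper added only for illustration, and "log.h" is assumed to be included alongside "common.h":

    #include "common.h"
    #include "log.h"

    // hypothetical helper: dump a tokenized prompt at debug verbosity
    static void debug_dump_tokens(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
        LOG_DBG("prompt tokens: %s\n", string_from(ctx, tokens).c_str());
    }
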
//
// Filesystem utils
//
--- /dev/null
+#include "log.h"
+
+#include <condition_variable>
+#include <cstdarg>
+#include <cstdio>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+
+void gpt_log_set_verbosity_thold(int verbosity) {
+ gpt_log_verbosity_thold = verbosity;
+}
+
+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD "\033[1m"
+#define LOG_COL_RED "\033[31m"
+#define LOG_COL_GREEN "\033[32m"
+#define LOG_COL_YELLOW "\033[33m"
+#define LOG_COL_BLUE "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN "\033[36m"
+#define LOG_COL_WHITE "\033[37m"
+
+static int64_t t_us() {
+ return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+// colors
+enum gpt_log_col : int {
+ GPT_LOG_COL_DEFAULT = 0,
+ GPT_LOG_COL_BOLD,
+ GPT_LOG_COL_RED,
+ GPT_LOG_COL_GREEN,
+ GPT_LOG_COL_YELLOW,
+ GPT_LOG_COL_BLUE,
+ GPT_LOG_COL_MAGENTA,
+ GPT_LOG_COL_CYAN,
+ GPT_LOG_COL_WHITE,
+};
+
+// disable colors by default
+static std::vector<const char *> g_col = {
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+};
+
+struct gpt_log_entry {
+ enum ggml_log_level level;
+
+ bool prefix;
+
+ int64_t timestamp;
+
+ std::vector<char> msg;
+
+ // signals the worker thread to stop
+ bool is_end;
+
+ void print(FILE * file = nullptr) const {
+ FILE * fcur = file;
+ if (!fcur) {
+ // stderr displays DBG messages only when their verbosity level is not higher than the threshold
+ // these messages will still be logged to a file
+ if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+ return;
+ }
+
+ fcur = stdout;
+
+ if (level != GGML_LOG_LEVEL_NONE) {
+ fcur = stderr;
+ }
+ }
+
+ if (level != GGML_LOG_LEVEL_NONE && prefix) {
+ if (timestamp) {
+ // [M.s.ms.us]
+ fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
+ g_col[GPT_LOG_COL_BLUE],
+ (int) (timestamp / 1000000 / 60),
+ (int) (timestamp / 1000000 % 60),
+ (int) (timestamp / 1000 % 1000),
+ (int) (timestamp % 1000),
+ g_col[GPT_LOG_COL_DEFAULT]);
+ }
+
+ switch (level) {
+ case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
+ case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
+ case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
+ case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
+ default:
+ break;
+ }
+ }
+
+ fprintf(fcur, "%s", msg.data());
+
+ if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
+ fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+ }
+
+ fflush(fcur);
+ }
+};
+
+struct gpt_log {
+ // default capacity - will be expanded if needed
+ gpt_log() : gpt_log(256) {}
+
+ gpt_log(size_t capacity) {
+ file = nullptr;
+ prefix = false;
+ timestamps = false;
+ running = false;
+ t_start = t_us();
+
+ // initial message size - will be expanded if longer messages arrive
+ entries.resize(capacity);
+ for (auto & entry : entries) {
+ entry.msg.resize(256);
+ }
+
+ head = 0;
+ tail = 0;
+
+ resume();
+ }
+
+ ~gpt_log() {
+ pause();
+ if (file) {
+ fclose(file);
+ }
+ }
+
+private:
+ std::mutex mtx;
+ std::thread thrd;
+ std::condition_variable cv;
+
+ FILE * file;
+
+ bool prefix;
+ bool timestamps;
+ bool running;
+
+ int64_t t_start;
+
+ // ring buffer of entries
+ std::vector<gpt_log_entry> entries;
+ size_t head;
+ size_t tail;
+
+ // worker thread copies into this
+ gpt_log_entry cur;
+
+public:
+ void add(enum ggml_log_level level, const char * fmt, va_list args) {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (!running) {
+ // discard messages while the worker thread is paused
+ return;
+ }
+
+ auto & entry = entries[tail];
+
+ {
+ // cannot use args twice, so make a copy in case we need to expand the buffer
+ va_list args_copy;
+ va_copy(args_copy, args);
+
+#if 1
+ const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
+ if (n >= entry.msg.size()) {
+ entry.msg.resize(n + 1);
+ vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
+ }
+#else
+ // hack for bolding arguments
+
+ std::stringstream ss;
+ for (int i = 0; fmt[i] != 0; i++) {
+ if (fmt[i] == '%') {
+ ss << LOG_COL_BOLD;
+ while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
+ ss << LOG_COL_DEFAULT;
+ if (fmt[i] == 0) break;
+ }
+ ss << fmt[i];
+ }
+ const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
+ if (n >= entry.msg.size()) {
+ entry.msg.resize(n + 1);
+ vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
+ }
+#endif
+ }
+
+ entry.level = level;
+ entry.prefix = prefix;
+ entry.timestamp = 0;
+ if (timestamps) {
+ entry.timestamp = t_us() - t_start;
+ }
+ entry.is_end = false;
+
+ tail = (tail + 1) % entries.size();
+ if (tail == head) {
+ // expand the buffer
+ std::vector<gpt_log_entry> new_entries(2*entries.size());
+
+ size_t new_tail = 0;
+
+ do {
+ new_entries[new_tail] = std::move(entries[head]);
+
+ head = (head + 1) % entries.size();
+ new_tail = (new_tail + 1);
+ } while (head != tail);
+
+ head = 0;
+ tail = new_tail;
+
+ for (size_t i = tail; i < new_entries.size(); i++) {
+ new_entries[i].msg.resize(256);
+ }
+
+ entries = std::move(new_entries);
+ }
+
+ cv.notify_one();
+ }
+
+ void resume() {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (running) {
+ return;
+ }
+
+ running = true;
+
+ thrd = std::thread([this]() {
+ while (true) {
+ {
+ std::unique_lock<std::mutex> lock(mtx);
+ cv.wait(lock, [this]() { return head != tail; });
+
+ cur = entries[head];
+
+ head = (head + 1) % entries.size();
+ }
+
+ if (cur.is_end) {
+ break;
+ }
+
+ cur.print(); // stdout and stderr
+
+ if (file) {
+ cur.print(file);
+ }
+ }
+ });
+ }
+
+ void pause() {
+ {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (!running) {
+ return;
+ }
+
+ running = false;
+
+ // push an entry to signal the worker thread to stop
+ {
+ auto & entry = entries[tail];
+ entry.is_end = true;
+
+ tail = (tail + 1) % entries.size();
+ }
+
+ cv.notify_one();
+ }
+
+ thrd.join();
+ }
+
+ void set_file(const char * path) {
+ pause();
+
+ if (file) {
+ fclose(file);
+ }
+
+ if (path) {
+ file = fopen(path, "w");
+ } else {
+ file = nullptr;
+ }
+
+ resume();
+ }
+
+ void set_colors(bool colors) {
+ pause();
+
+ if (colors) {
+ g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+ g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
+ g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
+ g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
+ g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
+ g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
+ g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+ g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
+ g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
+ } else {
+ for (size_t i = 0; i < g_col.size(); i++) {
+ g_col[i] = "";
+ }
+ }
+
+ resume();
+ }
+
+ void set_prefix(bool prefix) {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ this->prefix = prefix;
+ }
+
+ void set_timestamps(bool timestamps) {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ this->timestamps = timestamps;
+ }
+};
+
+//
+// public API
+//
+
+struct gpt_log * gpt_log_init() {
+ return new gpt_log;
+}
+
+struct gpt_log * gpt_log_main() {
+ static struct gpt_log log;
+
+ return &log;
+}
+
+void gpt_log_pause(struct gpt_log * log) {
+ log->pause();
+}
+
+void gpt_log_resume(struct gpt_log * log) {
+ log->resume();
+}
+
+void gpt_log_free(struct gpt_log * log) {
+ delete log;
+}
+
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ log->add(level, fmt, args);
+ va_end(args);
+}
+
+void gpt_log_set_file(struct gpt_log * log, const char * file) {
+ log->set_file(file);
+}
+
+void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+ log->set_colors(colors);
+}
+
+void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+ log->set_prefix(prefix);
+}
+
+void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+ log->set_timestamps(timestamps);
+}
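A short sketch of the standalone API above, under the assumption that a dedicated logger (separate from gpt_log_main()) is wanted; the file name and message are illustrative:

    #include "log.h"

    int main() {
        struct gpt_log * log = gpt_log_init();
        gpt_log_set_colors(log, false);
        gpt_log_set_file(log, "example.log"); // messages are also printed to the console
        gpt_log_add(log, GGML_LOG_LEVEL_INFO, "hello from a dedicated logger: %d\n", 42);
        gpt_log_free(log); // stops the worker thread and closes the file
        return 0;
    }
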
#pragma once
-#include <chrono>
-#include <cstring>
-#include <sstream>
-#include <iostream>
-#include <thread>
-#include <vector>
-#include <algorithm>
-#include <cinttypes>
+#include "ggml.h" // for ggml_log_level
-// --------------------------------
-//
-// Basic usage:
-//
-// --------
-//
-// The LOG() and LOG_TEE() macros are ready to go by default
-// they do not require any initialization.
-//
-// LOGLN() and LOG_TEELN() are variants which automatically
-// include \n character at the end of the log string.
-//
-// LOG() behaves exactly like printf, by default writing to a logfile.
-// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
-//
-// Default logfile is named
-// "llama.<threadID>.log"
-// Default LOG_TEE() secondary output target is
-// stderr
-//
-// Logs can be dynamically disabled or enabled using functions:
-// log_disable()
-// and
-// log_enable()
-//
-// A log target can be changed with:
-// log_set_target( string )
-// creating and opening, or re-opening a file by string filename
-// or
-// log_set_target( FILE* )
-// allowing to point at stderr, stdout, or any valid FILE* file handler.
-//
-// --------
-//
-// End of Basic usage.
-//
-// --------------------------------
-
-// Specifies a log target.
-// default uses log_handler() with "llama.log" log file
-// this can be changed, by defining LOG_TARGET
-// like so:
-//
-// #define LOG_TARGET (a valid FILE*)
-// #include "log.h"
-//
-// or it can be simply redirected to stdout or stderr
-// like so:
-//
-// #define LOG_TARGET stderr
-// #include "log.h"
-//
-// The log target can also be redirected to a different function
-// like so:
-//
-// #define LOG_TARGET log_handler_different()
-// #include "log.h"
-//
-// FILE* log_handler_different()
-// {
-// return stderr;
-// }
-//
-// or:
-//
-// #define LOG_TARGET log_handler_another_one("somelog.log")
-// #include "log.h"
-//
-// FILE* log_handler_another_one(char*filename)
-// {
-// static FILE* logfile = nullptr;
-// (...)
-// if( !logfile )
-// {
-// fopen(...)
-// }
-// (...)
-// return logfile
-// }
-//
-#ifndef LOG_TARGET
- #define LOG_TARGET log_handler()
-#endif
-
-#ifndef LOG_TEE_TARGET
- #define LOG_TEE_TARGET stderr
+#ifndef __GNUC__
+# define LOG_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
-// Utility for synchronizing log configuration state
-// since std::optional was introduced only in c++17
-enum LogTriState
-{
- LogTriStateSame,
- LogTriStateFalse,
- LogTriStateTrue
-};
-
-// Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
- static std::string pid;
- if (pid.empty())
- {
- // std::this_thread::get_id() is the most portable way of obtaining a "process id"
- // it's not the same as "pid" but is unique enough to solve multiple instances
- // trying to write to the same log.
- std::stringstream ss;
- ss << std::this_thread::get_id();
- pid = ss.str();
- }
-
- return pid;
-}
-
-// Utility function for generating log file names with unique id based on thread id.
-// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
-// where the number is a runtime id of the current thread.
-
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
-
-// INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
-{
- static bool _multilog = false;
-
- if (multilog != LogTriStateSame)
- {
- _multilog = multilog == LogTriStateTrue;
- }
+#define LOG_DEFAULT_DEBUG 1
+#define LOG_DEFAULT_LLAMA 0
- std::stringstream buf;
+// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity of the message is higher than the threshold
+// set via gpt_log_set_verbosity_thold()
+extern int gpt_log_verbosity_thold;
- buf << log_file_basename;
- if (_multilog)
- {
- buf << ".";
- buf << log_get_pid();
- }
- buf << ".";
- buf << log_file_extension;
+void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
- return buf.str();
-}
+// the gpt_log uses an internal worker thread to print/write log messages
+// when the worker thread is paused, incoming log messages are discarded
+struct gpt_log;
-#ifndef LOG_DEFAULT_FILE_NAME
- #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
-#endif
-
-// Utility for turning #define values into string literals
-// so we can have a define for stderr and
-// we can print "stderr" instead of literal stderr, etc.
-#define LOG_STRINGIZE1(s) #s
-#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
+struct gpt_log * gpt_log_init();
+struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
+void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
+void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
+void gpt_log_free (struct gpt_log * log);
-#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
+LOG_ATTRIBUTE_FORMAT(3, 4)
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
-// Allows disabling timestamps.
-// in order to disable, define LOG_NO_TIMESTAMPS
-// like so:
+// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
-// #define LOG_NO_TIMESTAMPS
-// #include "log.h"
+// regular log output:
//
-#ifndef LOG_NO_TIMESTAMPS
- #ifndef _MSC_VER
- #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #else
- #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #endif
-#else
- #define LOG_TIMESTAMP_FMT "%s"
- #define LOG_TIMESTAMP_VAL ,""
-#endif
-
-#ifdef LOG_TEE_TIMESTAMPS
- #ifndef _MSC_VER
- #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #else
- #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #endif
-#else
- #define LOG_TEE_TIMESTAMP_FMT "%s"
- #define LOG_TEE_TIMESTAMP_VAL ,""
-#endif
-
-// Allows disabling file/line/function prefix
-// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
-// like so:
+// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
+// llm_load_tensors: ggml ctx size = 0.27 MiB
+// llm_load_tensors: offloading 32 repeating layers to GPU
+// llm_load_tensors: offloading non-repeating layers to GPU
//
-// #define LOG_NO_FILE_LINE_FUNCTION
-// #include "log.h"
+// with prefix = true, timestamps = true, the log output will look like this:
//
-#ifndef LOG_NO_FILE_LINE_FUNCTION
- #ifndef _MSC_VER
- #define LOG_FLF_FMT "[%24s:%5d][%24s] "
- #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
- #else
- #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
- #endif
-#else
- #define LOG_FLF_FMT "%s"
- #define LOG_FLF_VAL ,""
-#endif
-
-#ifdef LOG_TEE_FILE_LINE_FUNCTION
- #ifndef _MSC_VER
- #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
- #else
- #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
- #endif
-#else
- #define LOG_TEE_FLF_FMT "%s"
- #define LOG_TEE_FLF_VAL ,""
-#endif
-
-// INTERNAL, DO NOT USE
-// USE LOG() INSTEAD
+// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
+// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
+// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
+// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
- #define LOG_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- } while (0)
-#else
- #define LOG_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- } while (0)
-#endif
-
-// INTERNAL, DO NOT USE
-// USE LOG_TEE() INSTEAD
+// I - info (stdout, V = 0)
+// W - warning (stderr, V = 0)
+// E - error (stderr, V = 0)
+// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
- #define LOG_TEE_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
- { \
- fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
- fflush(LOG_TEE_TARGET); \
- } \
- } while (0)
-#else
- #define LOG_TEE_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
- { \
- fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
- fflush(LOG_TEE_TARGET); \
- } \
- } while (0)
-#endif
-// The '\0' as a last argument, is a trick to bypass the silly
-// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
-// so we can have a single macro which can be called just like printf.
+void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
+void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
+void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
+void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
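The prefixed, timestamped output shown in the comment above can be reproduced on the default logger with the setters below; a brief sketch with an illustrative message:

    #include "log.h"

    int main() {
        gpt_log_set_prefix    (gpt_log_main(), true); // print the I/W/E/D level marker
        gpt_log_set_timestamps(gpt_log_main(), true); // print the elapsed time before each message
        LOG_INF("llm_load_tensors: offloading 32 repeating layers to GPU\n");
        return 0;
    }
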
-// Main LOG macro.
-// behaves like printf, and supports arguments the exact same way.
+// helper macros for logging
+// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
-#if !defined(_MSC_VER) || defined(__clang__)
- #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
-#else
- #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
-#endif
-
-// Main TEE macro.
-// does the same as LOG
-// and
-// simultaneously writes stderr.
+// for example:
//
-// Secondary target can be changed just like LOG_TARGET
-// by defining LOG_TEE_TARGET
+// LOG_DBG("this is a debug message: %d\n", expensive_function());
+//
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//
-#if !defined(_MSC_VER) || defined(__clang__)
- #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
-#else
- #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
-#endif
-
-// LOG macro variants with auto endline.
-#if !defined(_MSC_VER) || defined(__clang__)
- #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
- #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
-#else
- #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
- #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
-#endif
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
-{
- static bool _initialized = false;
- static bool _append = false;
- static bool _disabled = filename.empty() && target == nullptr;
- static std::string log_current_filename{filename};
- static FILE *log_current_target{target};
- static FILE *logfile = nullptr;
-
- if (change)
- {
- if (append != LogTriStateSame)
- {
- _append = append == LogTriStateTrue;
- return logfile;
- }
-
- if (disable == LogTriStateTrue)
- {
- // Disable primary target
- _disabled = true;
- }
- // If previously disabled, only enable, and keep previous target
- else if (disable == LogTriStateFalse)
- {
- _disabled = false;
- }
- // Otherwise, process the arguments
- else if (log_current_filename != filename || log_current_target != target)
- {
- _initialized = false;
- }
- }
-
- if (_disabled)
- {
- // Log is disabled
- return nullptr;
- }
-
- if (_initialized)
- {
- // with fallback in case something went wrong
- return logfile ? logfile : stderr;
- }
-
- // do the (re)initialization
- if (target != nullptr)
- {
- if (logfile != nullptr && logfile != stdout && logfile != stderr)
- {
- fclose(logfile);
- }
-
- log_current_filename = LOG_DEFAULT_FILE_NAME;
- log_current_target = target;
-
- logfile = target;
- }
- else
- {
- if (log_current_filename != filename)
- {
- if (logfile != nullptr && logfile != stdout && logfile != stderr)
- {
- fclose(logfile);
- }
- }
-
- logfile = fopen(filename.c_str(), _append ? "a" : "w");
- }
-
- if (!logfile)
- {
- // Verify whether the file was opened, otherwise fallback to stderr
- logfile = stderr;
-
- fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
- fflush(stderr);
-
- // At this point we let the init flag be to true below, and let the target fallback to stderr
- // otherwise we would repeatedly fopen() which was already unsuccessful
- }
-
- _initialized = true;
-
- return logfile ? logfile : stderr;
-}
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
-{
- return log_handler1_impl(change, append, disable, filename, target);
-}
-
-// Disables logs entirely at runtime.
-// Makes LOG() and LOG_TEE() produce no output,
-// until enabled back.
-#define log_disable() log_disable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_disable_impl()
-{
- return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
-}
-
-// Enables logs at runtime.
-#define log_enable() log_enable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_enable_impl()
-{
- return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
-}
-
-// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
-#define log_set_target(target) log_set_target_impl(target)
-
-// INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler() { return log_handler1_impl(); }
-
-// Enable or disable creating separate log files for each run.
-// can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-// can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
- return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
-inline void log_test()
-{
- log_disable();
- LOG("01 Hello World to nobody, because logs are disabled!\n");
- log_enable();
- LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
- LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
- log_set_target(stderr);
- LOG("04 Hello World to stderr!\n");
- LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
- log_set_target(LOG_DEFAULT_FILE_NAME);
- LOG("06 Hello World to default log file!\n");
- log_set_target(stdout);
- LOG("07 Hello World to stdout!\n");
- log_set_target(LOG_DEFAULT_FILE_NAME);
- LOG("08 Hello World to default log file again!\n");
- log_disable();
- LOG("09 Hello World _1_ into the void!\n");
- log_enable();
- LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
- log_disable();
- log_set_target("llama.anotherlog.log");
- LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
- log_enable();
- LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
- log_set_target("llama.yetanotherlog.log");
- LOG("13 Hello World this time in yet new file?\n");
- log_set_target(log_filename_generator("llama_autonamed", "log"));
- LOG("14 Hello World in log with generated filename!\n");
-#ifdef _MSC_VER
- LOG_TEE("15 Hello msvc TEE without arguments\n");
- LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
- LOG_TEELN("17 Hello msvc TEELN without arguments\n");
- LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
- LOG("19 Hello msvc LOG without arguments\n");
- LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
- LOGLN("21 Hello msvc LOGLN without arguments\n");
- LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
-#endif
-}
-
-inline bool log_param_single_parse(const std::string & param)
-{
- if ( param == "--log-test")
- {
- log_test();
- return true;
- }
-
- if ( param == "--log-disable")
- {
- log_disable();
- return true;
- }
-
- if ( param == "--log-enable")
- {
- log_enable();
- return true;
- }
-
- if (param == "--log-new")
- {
- log_multilog(true);
- return true;
- }
-
- if (param == "--log-append")
- {
- log_append(true);
- return true;
- }
-
- return false;
-}
-
-inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
-{
- if ( param == "--log-file")
- {
- if (!check_but_dont_parse)
- {
- log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
- }
-
- return true;
- }
-
- return false;
-}
-
-inline void log_print_usage()
-{
- printf("log options:\n");
- /* format
- printf(" -h, --help show this help message and exit\n");*/
- /* spacing
- printf("__-param----------------Description\n");*/
- printf(" --log-test Run simple logging test\n");
- printf(" --log-disable Disable trace logs\n");
- printf(" --log-enable Enable trace logs\n");
- printf(" --log-file Specify a log filename (without extension)\n");
- printf(" --log-new Create a separate new log file on start. "
- "Each log file will have unique name: \"<name>.<ID>.log\"\n");
- printf(" --log-append Don't truncate the old log file.\n");
- printf("\n");
-}
-
-#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
-
-// INTERNAL, DO NOT USE
-inline void log_dump_cmdline_impl(int argc, char **argv)
-{
- std::stringstream buf;
- for (int i = 0; i < argc; ++i)
- {
- if (std::string(argv[i]).find(' ') != std::string::npos)
- {
- buf << " \"" << argv[i] <<"\"";
- }
- else
- {
- buf << " " << argv[i];
- }
- }
- LOGLN("Cmd:%s", buf.str().c_str());
-}
-
-#define log_tostr(var) log_var_to_string_impl(var).c_str()
-
-inline std::string log_var_to_string_impl(bool var)
-{
- return var ? "true" : "false";
-}
-
-inline std::string log_var_to_string_impl(std::string var)
-{
- return var;
-}
-
-inline std::string log_var_to_string_impl(const std::vector<int> & var)
-{
- std::stringstream buf;
- buf << "[ ";
- bool first = true;
- for (auto e : var)
- {
- if (first)
- {
- first = false;
- }
- else
- {
- buf << ", ";
- }
- buf << std::to_string(e);
- }
- buf << " ]";
-
- return buf.str();
-}
-
-template <typename C, typename T>
-inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
-{
- std::stringstream buf;
- buf << "[ ";
-
- bool first = true;
- for (const auto & token : tokens)
- {
- if (!first) {
- buf << ", ";
- } else {
- first = false;
- }
-
- auto detokenized = llama_token_to_piece(ctx, token);
-
- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
- buf
- << "'" << detokenized << "'"
- << ":" << std::to_string(token);
- }
- buf << " ]";
-
- return buf.str();
-}
-
-template <typename C, typename B>
-inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
-{
- std::stringstream buf;
- buf << "[ ";
-
- bool first = true;
- for (int i = 0; i < batch.n_tokens; ++i)
- {
- if (!first) {
- buf << ", ";
- } else {
- first = false;
- }
-
- auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
-
- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
- buf
- << "\n" << std::to_string(i)
- << ":token '" << detokenized << "'"
- << ":pos " << std::to_string(batch.pos[i])
- << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
- << ":seq_id " << std::to_string(batch.seq_id[i][0])
- << ":logits " << std::to_string(batch.logits[i]);
- }
- buf << " ]";
-
- return buf.str();
-}
-
-#ifdef LOG_DISABLE_LOGS
-
-#undef LOG
-#define LOG(...) // dummy stub
-#undef LOGLN
-#define LOGLN(...) // dummy stub
-
-#undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_DISABLE
-#define LOG_DISABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
+#define LOG_TMPL(level, verbosity, ...) \
+ do { \
+ if ((verbosity) <= gpt_log_verbosity_thold) { \
+ gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+ } \
+ } while (0)
-#undef LOG_SET_TARGET
-#define LOG_SET_TARGET(...) // dummy stub
+#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
-#undef LOG_DUMP_CMDLINE
-#define LOG_DUMP_CMDLINE(...) // dummy stub
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#endif // LOG_DISABLE_LOGS
+#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
+#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
+#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
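Taken together, these macros all funnel through LOG_TMPL, which drops a message when its verbosity argument exceeds gpt_log_verbosity_thold and otherwise forwards it to the logger returned by gpt_log_main(). A minimal usage sketch under that assumption (the function and variable names below are placeholders, not taken from this diff):

#include "log.h"

static void log_usage_sketch(int n_ctx) {
    LOG_INF("%s: n_ctx = %d\n", __func__, n_ctx);   // INFO: shown at the default threshold
    LOG_WRN("%s: n_ctx is small\n", __func__);      // WARN: same sink, colored when colors are enabled
    LOG_DBG("%s: extra detail\n", __func__);        // DEBUG: gated behind LOG_DEFAULT_DEBUG
    LOGV(2, "only shown when verbosity >= 2\n");    // unleveled text with an explicit verbosity gate
    LOG("raw output with no level prefix\n");       // replaces printf/LOG_TEE-style raw printing
}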
#include "common.h"
#include "log.h"
+#include <cinttypes>
#include <cstdint>
+#include <cstdio>
#include <fstream>
+#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
}
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
- std::string result = "\tlogits ";
+ std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
#include "train.h"
#include "common.h"
+#include <algorithm>
#include <random>
#include <sstream>
#include <functional>
+#include <cstring>
struct random_normal_distribution {
std::mt19937 gen;
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <algorithm>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp;
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
if (!params.batched_bench_output_jsonl) {
- LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
- LOG_TEE("\n");
- LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
- LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+ LOG("\n");
+ LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG("\n");
+ LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+ LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
}
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
const float speed = n_kv / t;
if(params.batched_bench_output_jsonl) {
- LOG_TEE(
+ LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
);
} else {
- LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+ LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
llama_batch_free(batch);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
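The per-example changes that follow repeat one pattern: call gpt_init() once after argument parsing, then route errors through LOG_ERR/LOG_WRN, status through LOG_INF, and raw output through LOG. A condensed sketch of that pattern, assuming the common helpers used elsewhere in this diff (gpt_params_parse, llama_model_params_from_gpt_params); the messages are illustrative:

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init(); // install the common logger before the first LOG_* call

    llama_backend_init();

    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__); // previously fprintf(stderr, ...)
        llama_backend_free();
        return 1;
    }

    LOG_INF("%s: model loaded\n", __func__); // previously LOG_TEE(...)

    llama_free_model(model);
    llama_backend_free();

    return 0;
}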
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <algorithm>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
// number of parallel batches
int n_parallel = params.n_parallel;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
- fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1;
}
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
if (ctx == NULL) {
- fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
- LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
- LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
- LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+ LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+ LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
- fprintf(stderr, "\n");
+ LOG("\n");
for (auto id : tokens_list) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
- fflush(stderr);
-
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
//}
if (n_parallel > 1) {
- LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+ LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
}
// main loop
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
- LOG_TEE("\n");
+ LOG("\n");
if (n_parallel > 1) {
- LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+ LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
}
continue;
// if there is only one stream, we print immediately to stdout
if (n_parallel == 1) {
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
}
streams[i] += llama_token_to_piece(ctx, new_token_id);
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
- LOG_TEE("\n");
-
if (n_parallel > 1) {
- LOG_TEE("\n");
+ LOG("\n");
for (int32_t i = 0; i < n_parallel; ++i) {
- LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+ LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
}
}
const auto t_main_end = ggml_time_us();
- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
#include <climits>
#include <cstring>
#include <cstdarg>
+#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
- LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+ LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
- LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
+ LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
}
static void print_sample_weights(TransformerWeights *w){
- LOG("----- Quick print of first of the weight vales of all the variables\n");
- LOG("%f\n", w->token_embedding_table[0]);
- LOG("%f\n", w->rms_att_weight[0]);
- LOG("%f\n", w->rms_ffn_weight[0]);
-
- LOG("%f\n", w->wq[0]);
- LOG("%f\n", w->wk[0]);
- LOG("%f\n", w->wv[0]);
- LOG("%f\n", w->wo[0]);
- LOG("%f\n", w->w1[0]);
- LOG("%f\n", w->w2[0]);
- LOG("%f\n", w->w3[0]);
- LOG("%f\n", w->rms_att_weight[0]);
- if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+ LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+ LOG_INF("%f\n", w->token_embedding_table[0]);
+ LOG_INF("%f\n", w->rms_att_weight[0]);
+ LOG_INF("%f\n", w->rms_ffn_weight[0]);
+
+ LOG_INF("%f\n", w->wq[0]);
+ LOG_INF("%f\n", w->wk[0]);
+ LOG_INF("%f\n", w->wv[0]);
+ LOG_INF("%f\n", w->wo[0]);
+ LOG_INF("%f\n", w->w1[0]);
+ LOG_INF("%f\n", w->w2[0]);
+ LOG_INF("%f\n", w->w3[0]);
+ LOG_INF("%f\n", w->rms_att_weight[0]);
+ if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
};
static void print_params(struct my_llama_hparams * params) {
- LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
- LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
- LOG("%s: n_embd: %u\n", __func__, params->n_embd);
- LOG("%s: n_mult: %u\n", __func__, params->n_mult);
- LOG("%s: n_head: %u\n", __func__, params->n_head);
- LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
- LOG("%s: n_ff: %u\n", __func__, params->n_ff);
- LOG("%s: n_layer: %u\n", __func__, params->n_layer);
- LOG("%s: n_rot: %u\n", __func__, params->n_rot);
+ LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
+ LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
+ LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
+ LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
+ LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
+ LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+ LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
+ LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
+ LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- LOG("%s: Allocating ", __func__);
+ LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) {
- LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+ LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
- LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+ LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
int main(int argc, char ** argv) {
+ gpt_init();
+
struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
return 1;
}
- log_set_target(stdout);
+
Config config;
TransformerWeights weights = {};
{
- LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+ LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
- LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+ LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
- LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+ LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
- LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+ LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
- LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+ LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;
#include "ggml-metal.h"
#endif
+#include <algorithm>
+#include <climits>
#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
#include <string>
#include <tuple>
#include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <climits>
//////////////////////////////////////////////////
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <ctime>
llama_kv_cache_clear(ctx);
// run model
- fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+ LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to encode\n", __func__);
+ LOG_ERR("%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to decode\n", __func__);
+ LOG_ERR("%s : failed to decode\n", __func__);
}
}
return 1;
}
+ gpt_init();
+
params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
- print_build_info();
-
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
- fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
+ LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// split the prompt into lines
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
if (inp.size() > n_batch) {
- fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+ LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
- fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
- fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+ LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+ LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) {
- fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
- fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+ LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) {
- fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+ LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
}
}
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
- fprintf(stdout, "\n");
+ LOG("\n");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
- fprintf(stdout, "embedding %d: ", j);
+ LOG("embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ LOG("%6.0f ", emb[j * n_embd + i]);
} else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ LOG("%9.6f ", emb[j * n_embd + i]);
}
}
- fprintf(stdout, " ... ");
+ LOG(" ... ");
for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ LOG("%6.0f ", emb[j * n_embd + i]);
} else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ LOG("%9.6f ", emb[j * n_embd + i]);
}
}
- fprintf(stdout, "\n");
+ LOG("\n");
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
- fprintf(stdout, "embedding %d: ", j);
+ LOG("embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ LOG("%6.0f ", emb[j * n_embd + i]);
} else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ LOG("%9.6f ", emb[j * n_embd + i]);
}
}
- fprintf(stdout, "\n");
+ LOG("\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
- fprintf(stdout, "\n");
- printf("cosine similarity matrix:\n\n");
+ LOG("\n");
+ LOG("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
- fprintf(stdout, "%6.6s ", prompts[i].c_str());
+ LOG("%6.6s ", prompts[i].c_str());
}
- fprintf(stdout, "\n");
+ LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
- fprintf(stdout, "%6.2f ", sim);
+ LOG("%6.2f ", sim);
}
- fprintf(stdout, "%1.10s", prompts[i].c_str());
- fprintf(stdout, "\n");
+ LOG("%1.10s", prompts[i].c_str());
+ LOG("\n");
}
}
}
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
- fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+ LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
- if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
- fprintf(stdout, "[");
+ if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+ LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
- fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+ LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
- if (i < n_embd) fprintf(stdout, ","); else break;
+ if (i < n_embd) LOG(","); else break;
}
- fprintf(stdout, notArray ? "]\n }" : "]");
+ LOG(notArray ? "]\n }" : "]");
j++;
- if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
+ if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
}
- fprintf(stdout, notArray ? "\n ]" : "]\n");
+ LOG(notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
- fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+ LOG(",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
- fprintf(stdout, " [");
+ LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
- fprintf(stdout, "%6.2f", sim);
+ LOG("%6.2f", sim);
j++;
- if (j < n_embd_count) fprintf(stdout, ", "); else break;
+ if (j < n_embd_count) LOG(", "); else break;
}
- fprintf(stdout, " ]");
+ LOG(" ]");
i++;
- if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
+ if (i < n_embd_count) LOG(",\n"); else break;
}
- fprintf(stdout, "\n ]");
+ LOG("\n ]");
}
- if (notArray) fprintf(stdout, "\n}\n");
+ if (notArray) LOG("\n}\n");
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
// clean up
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
-#include <random>
#include <string>
-#include <tuple>
#include <vector>
/**
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
- printf(" [\n");
+ LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
- printf(" ..., \n");
+ LOG(" ..., \n");
i2 = ne[2] - n;
}
- printf(" [\n");
+ LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
- printf(" ..., \n");
+ LOG(" ..., \n");
i1 = ne[1] - n;
}
- printf(" [");
+ LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
- printf("..., ");
+ LOG("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
} else {
GGML_ABORT("fatal error");
}
- printf("%12.4f", v);
+ LOG("%12.4f", v);
sum += v;
- if (i0 < ne[0] - 1) printf(", ");
+ if (i0 < ne[0] - 1) LOG(", ");
}
- printf("],\n");
+ LOG("],\n");
}
- printf(" ],\n");
+ LOG(" ],\n");
}
- printf(" ]\n");
- printf(" sum = %f\n", sum);
+ LOG(" ]\n");
+ LOG(" sum = %f\n", sum);
}
}
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
- printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
- t->name, ggml_type_name(t->type), ggml_op_desc(t),
- src0->name, ggml_ne_string(src0).c_str(),
- src1 ? src1_str : "",
- ggml_ne_string(t).c_str());
+ LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+ t->name, ggml_type_name(t->type), ggml_op_desc(t),
+ src0->name, ggml_ne_string(src0).c_str(),
+ src1 ? src1_str : "",
+ ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
return 1;
}
- print_build_info();
+ gpt_init();
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
- fprintf(stderr, "%s : failed to init\n", __func__);
+ LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
}
bool OK = run(ctx, params);
return 1;
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
return 1;
}
- g_verbose = (params.verbosity == 1);
+ g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
return 1;
}
+ gpt_init();
+
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params);
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <cmath>
#endif
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s \\\n"
- " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+ LOG("\nexample usage:\n");
+ LOG("\n %s \\\n"
+ " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
- LOG_TEE("\n");
+ LOG("\n");
}
struct Stats {
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
- fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+ LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
- if (m_params.verbosity > 1) {
- printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
- }
+ LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
- fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+ LOG("\n");
+ LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
- fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+ LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
- if (m_params.verbosity > 1) {
- printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
- }
+ LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
- fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+ LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
}
if (n_zeros != 0 && is_first) {
- fprintf(stderr, "\n");
+ LOG_INF("\n");
is_first = false;
}
if (n_zeros == n_all) {
- fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+ LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
- fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+ LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
}
if (to_store.size() < m_stats.size()) {
- fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+ LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
std::ofstream out(fname, std::ios::binary);
out.write(m_params.prompt_file.c_str(), len);
}
- if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
- }
+ LOGV(1, "\n");
+ LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
- printf("%s: failed to open %s\n",__func__, fname);
+ LOG_ERR("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
- printf("%s: no data in file %s\n", __func__, fname);
+ LOG_ERR("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
- printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+ LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
- printf("%s: failed reading number of values for entry %d\n",__func__,i);
+ LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {};
return false;
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
- printf("%s: failed reading data for entry %d\n",__func__,i);
+ LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {};
return false;
}
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+ LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
- fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+ LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
- fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+ LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
- n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
return false;
}
double nll = 0.0;
double nll2 = 0.0;
- fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+ LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
if (params.compute_ppl) {
const int first = n_ctx/2;
- const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+ const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
- printf("\n");
+ LOG("\n");
if (params.compute_ppl) {
nll2 /= count;
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
- printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+ LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
- printf("Unexpected negative standard deviation of log(prob)\n");
+ LOG("Unexpected negative standard deviation of log(prob)\n");
}
}
params.n_ctx = 512;
params.logits_all = true;
- params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}
+ gpt_init();
+
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
for (const auto & in_file : params.in_files) {
- printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+ LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
- fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
+ LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
}
if (params.in_files.size() > 1) {
- printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+ LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
}
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
- fprintf(stderr, "%s : failed to init\n", __func__);
+ LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
if (!compute_imatrix(ctx, params)) {
g_collector.save_imatrix();
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
#include "common.h"
#include "console.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cassert>
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
is_interacting = true;
} else {
console::cleanup();
- printf("\n");
+ LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
return 1;
}
- auto & sparams = params.sparams;
+ gpt_init();
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("infill", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+ auto & sparams = params.sparams;
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
+
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
- printf("\n************\n");
- printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+ LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
- print_build_info();
-
- LOG("%s: llama backend init\n", __func__);
+ LOG_INF("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
g_smpl = &smpl;
// load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);
+ LOG_DBG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print system information
{
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
- LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
embd_inp.push_back(middle_token);
}
- LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
- LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("add_bos: %d\n", add_bos);
+ LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+ LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
}
if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
params.n_keep = (int)embd_inp.size();
}
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+ LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+ LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
}
if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("\n");
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- LOG_TEE("'\n");
+ LOG("'\n");
}
- LOG_TEE("\n");
+ LOG_INF("\n");
}
if (params.interactive) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG_INF("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG_INF("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
smpl = gpt_sampler_init(model, sparams);
- LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
- LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
- LOG_TEE("\n\n");
+ LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
- LOG_TEE("\n##### Infill mode #####\n\n");
+ LOG("\n");
+ LOG("\n##### Infill mode #####\n\n");
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG( " - Press Ctrl+C to interject at any time.\n");
#endif
- LOG_TEE( "%s\n", control_message);
+ LOG( "%s\n", control_message);
is_interacting = params.interactive_first;
}
embd.resize(max_embd_size);
console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
- fflush(stdout);
}
// infinite text generation via context swapping
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
n_past -= n_discard;
- LOG("after swap: n_past = %d\n", n_past);
+ LOG_DBG("after swap: n_past = %d\n", n_past);
- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
}
n_eval = params.n_batch;
}
- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
}
}
gpt_sampler_accept(smpl, id, true);
- // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
// decrement remaining sampling budget
--n_remain;
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
output_ss << token_str;
}
}
- fflush(stdout);
}
// reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+ LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
- fflush(stdout);
- printf("\n");
+ LOG("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
- // LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
- LOG("found EOS token\n");
+ LOG_DBG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
- printf("\n");
+ LOG("\n");
console::set_display(console::user_input);
- fflush(stdout);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");
if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty()) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
- printf("%s", buffer.c_str());
+ LOG("%s", buffer.c_str());
}
std::string line;
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
- printf("%s", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
}
- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
}
n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
}
}
if (!params.interactive && n_remain <= 0) {
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
- LOG_TEE("\n");
+ LOG("\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl);
llama_backend_free();
-#ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
return 0;
}
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
-#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cinttypes>
#include <limits>
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key);
if (i == -1) {
- LOG_TEE("key %s not found in file\n", key);
+ LOG_ERR("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key));
}
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
- LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+ LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
}
if (load_image_size == nullptr) {
load_image_size = clip_image_size_init();
}
- LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+ LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width;
image_size_height = load_image_size->height;
if (is_inf) {
const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models are missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
- LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
}
- LOG_TEE("%s: description: %s\n", __func__, description.c_str());
- LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
- LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
- LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
- LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
- LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
- LOG_TEE("\n");
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
+ LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+ LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
+ LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
+ LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
+ LOG_INF("\n");
}
const int n_tensors = gguf_get_n_tensors(ctx);
// kv
const int n_kv = gguf_get_n_kv(ctx);
- LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+ LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
n_type[type]++;
}
- LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+ LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
}
replace_all(value, "\n", "\\n");
- LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+ LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
continue;
}
- LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+ LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
}
}
size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size;
if (verbosity >= 3) {
- LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+ LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
}
}
#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
- LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif
#ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init();
- LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
- LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
- LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
- LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+ LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
if (verbosity >= 1) {
- LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
- LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
- LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
- LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
- LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
- LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+ LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
+ LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+ LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
+ LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
+ LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+ LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
}
- LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+ LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors
{
new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) {
- LOG_TEE("%s: ggml_init() failed\n", __func__);
+ LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
- LOG_TEE("cannot open model file for loading tensors\n");
+ LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
if (!fin) {
- LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+ LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
if (verbosity >= 2) {
- LOG_TEE("\n%s: vision model hparams\n", __func__);
- LOG_TEE("image_size %d\n", hparams.image_size);
- LOG_TEE("patch_size %d\n", hparams.patch_size);
- LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
- LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
- LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
- LOG_TEE("v_n_head %d\n", hparams.n_head);
- LOG_TEE("v_n_layer %d\n", hparams.n_layer);
- LOG_TEE("v_eps %f\n", hparams.eps);
- LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
- LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
- LOG_TEE("v_image_grid_pinpoints: ");
+ LOG_INF("\n%s: vision model hparams\n", __func__);
+ LOG_INF("image_size %d\n", hparams.image_size);
+ LOG_INF("patch_size %d\n", hparams.patch_size);
+ LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
+ LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
+ LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
+ LOG_INF("v_n_head %d\n", hparams.n_head);
+ LOG_INF("v_n_layer %d\n", hparams.n_layer);
+ LOG_INF("v_eps %f\n", hparams.eps);
+ LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+ LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+ LOG_INF("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
- LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+ LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
}
- LOG_TEE("\n");
- LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+ LOG_INF("\n");
+ LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
}
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) {
- LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+ LOG_ERR("%s: failed to load vision model tensors\n", __func__);
}
// LLaVA projection
} catch (std::runtime_error & /*e*/) { }
try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
- // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
+ // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
- LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+ LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
}
return new_clip;
int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
- LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
+ LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
- LOG_TEE("%s: failed to decode image bytes\n", __func__);
+ LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
- // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+ // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images;
- LOG_TEE("%s: multiple %d\n", __func__, multiple);
+ LOG_INF("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>());
if (multiple <= 1) {
clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
- LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+ LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
- LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+ LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 * refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
- LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+ LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
int idx = 0;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
- LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+ LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
res_imgs->data[idx++] = *res;
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
auto & params = ctx->vision_model.hparams;
}
for (size_t i = 0; i < patches.size(); i++) {
- // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+ // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free(patches[i]);
}
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
- // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+ // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
const size_t n_elms = ggml_nelements(cur);
float * f32_data;
f32_data = (float *)conv_buf.data();
break;
default:
- LOG_TEE("Please use an input file in f32 or f16\n");
+ LOG_ERR("Please use an input file in f32 or f16\n");
gguf_free(ctx_out);
return false;
}
fout.put(0);
}
- LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+ LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
}
gguf_free(ctx_out);
{
- LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
- LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+ LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+ LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
}
return true;
#include <cstdio>
#include <cstdlib>
+#include <cstring>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
- LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+ LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
- LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+ LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
- LOG_TEE("%s: could not load image from base64 string.\n", __func__);
+ LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
};
static void print_usage(int, char ** argv) {
- LOG_TEE("\n example usage:\n");
- LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+ LOG("\n example usage:\n");
+ LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
- LOG_TEE("using base64 encoded image instead of command line image path\n");
+ LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
- LOG_TEE("%s: can't load image from prompt\n", __func__);
+ LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
- LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+ LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
- LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+ LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
// generate the response
- LOG_TEE("\n");
+ LOG("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
if (!smpl) {
- fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
- printf("%s", tmp);
+ LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
}
gpt_sampler_free(smpl);
- printf("\n");
+ LOG("\n");
}
static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
- LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
- auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+ auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
llama_backend_free();
}
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
-}
-
int main(int argc, char ** argv) {
ggml_time_init();
return 1;
}
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("llava", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+ gpt_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
- auto model = llava_init(&params);
+
+ auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
- auto ctx_llava = llava_init_context(&params, model);
+ auto * ctx_llava = llava_init_context(&params, model);
- auto image_embed = load_image(ctx_llava, &params, "");
+ auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
- auto ctx_llava = llava_init_context(&params, model);
+ auto * ctx_llava = llava_init_context(&params, model);
- auto image_embed = load_image(ctx_llava, &params, image);
+ auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
- std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+ LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
#include "clip.h"
-#include "common.h"
-#include "llama.h"
#include "llava.h"
-#include "base64.hpp"
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
#include <cstdio>
#include <cstdlib>
+#include <cstring>
+#include <limits>
#include <vector>
-#include <numeric>
+
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image
struct clip_image_u8 {
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
- // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+ // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
- LOG_TEE("%s: unable to preprocess image\n", __func__);
+ LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
- LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+ LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
- LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+ LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
- LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+ LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
- LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+ LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
- LOG_TEE("Unable to encode image\n");
+ LOG_ERR("Unable to encode image\n");
return false;
}
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
- LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+ LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
- LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+ LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
- LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+ LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
- LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+ LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
- LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+ LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
- LOG_TEE("Unable to allocate memory for image embeddings\n");
+ LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
- LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+ LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
- LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+ LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
- LOG_TEE("%s: coulnd't embed the image\n", __func__);
+ LOG_ERR("%s: coulnd't embed the image\n", __func__);
return NULL;
}
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
- LOG_TEE("%s: can't read file %s\n", __func__, path);
+ LOG_ERR("%s: can't read file %s\n", __func__, path);
return false;
}
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
- LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+ LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
- LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+ LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL;
}
#include "llama.h"
#include "ggml.h"
+#include <algorithm>
#include <cstdio>
#include <cstdlib>
+#include <cstring>
#include <vector>
+#include <iostream> // TODO: remove me
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
- LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
+ LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
- LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+ LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
- LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
- auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+ auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
- auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+ auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
- LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+ LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
- auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+ auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
- LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+ LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
- LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+ LOG_INF("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct gpt_sampler * smpl,
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
- auto ctx_clip = clip_init_context(params);
- auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+ auto * ctx_clip = clip_init_context(params);
+ auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
- std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+ LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
- LOG_TEE("prompt should be given or interactive mode should be on");
+ LOG_ERR("prompt should be given or interactive mode should be on");
return NULL;
}
- auto model = llava_init(params);
+ auto * model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
- auto ctx_llava = llava_init_context(params, model);
+ auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
- LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+ LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
- LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+ LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
// generate the response
- LOG_TEE("\n");
+ LOG_INF("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
return smpl;
return 1;
}
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("llava", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+ gpt_init();
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
for (auto & image : params.image) {
int n_past = 0;
- auto ctx_llava = minicpmv_init(&params, image, n_past);
+ auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
- LOG_TEE("<user>%s\n", params.prompt.c_str());
- LOG_TEE("<assistant>");
- auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+ LOG("<user>%s\n", params.prompt.c_str());
+ LOG("<assistant>");
+ auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
- std::string response = "";
+ std::string response;
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
- auto tmp = llama_loop(ctx_llava, smpl, n_past);
+ const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
- if(!have_tmp)continue;
- else break;
+ if (!have_tmp) {
+ continue;
+ }
+ break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
gpt_sampler_free(smpl);
}else {
while (true) {
- LOG_TEE("<user>");
+ LOG("<user>");
std::string prompt;
std::getline(std::cin, prompt);
- LOG_TEE("<assistant>");
- auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+ LOG("<assistant>");
+ auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
- std::string response = "";
+ std::string response;
for (int i = 0; i < max_tgt_len; i++) {
- auto tmp = llama_loop(ctx_llava, smpl, n_past);
+ const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
#include "arg.h"
#include "common.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cstdio>
return 1;
}
+ gpt_init();
+
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("lookahead", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
{
const std::string token_str = llama_token_to_piece(ctx, id);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
fflush(stdout);
}
}
}
if (llama_decode(ctx, batch) != 0) {
- fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+ LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}
const std::string token_str = llama_token_to_piece(ctx, id);
if (v == 0) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
} else {
// print light cyan
- printf("\033[0;96m%s\033[0m", token_str.c_str());
+ LOG("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);
// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
- printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+ LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
}
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
- printf(" - ngram %2d: ", i);
+ LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
- printf("\n");
+ LOG("\n");
}
}
auto t_dec_end = ggml_time_us();
- LOG_TEE("\n\n");
+ LOG("\n\n");
- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
- LOG_TEE("\n");
- LOG_TEE("W = %2d\n", W);
- LOG_TEE("N = %2d\n", N);
- LOG_TEE("G = %2d\n", G);
- LOG_TEE("\n");
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_accept = %d\n", n_accept);
+ LOG_INF("\n");
+ LOG_INF("W = %2d\n", W);
+ LOG_INF("N = %2d\n", N);
+ LOG_INF("G = %2d\n", G);
+ LOG_INF("\n");
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_accept = %d\n", n_accept);
- LOG_TEE("\n");
+ LOG_INF("\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "llama.h"
#include "ggml.h"
-#include <cmath>
#include <cstdint>
#include <cstdio>
+#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
-#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
return 1;
}
+ gpt_init();
+
const int n_draft = params.n_draft;
// init llama.cpp
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
- fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+ LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
- LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+ LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
ngram_cache_context.clear();
}
- LOG_TEE("\n");
+ LOG("\n");
- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
- LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+ LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cstdint>
return 1;
}
+ gpt_init();
+
// max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("lookup", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
- fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+ LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
}
// print current draft sequence
- LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+ LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0;
while (true) {
const std::string token_str = llama_token_to_piece(ctx, id);
if (!params.use_color) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
if (llama_token_is_eog(model, id)) {
// check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
- LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+ LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past;
++i_dft;
if (params.use_color) {
// color accepted draft token
- printf("\033[34m%s\033[0m", token_str.c_str());
+ LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout);
}
continue;
}
if (params.use_color) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
fflush(stdout);
- LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+ LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear();
draft.push_back(id);
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
- LOG_TEE("\n\n");
+ LOG("\n\n");
- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
- LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+ LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
- LOG_TEE("\ntarget:\n\n");
+ LOG_INF("\ntarget:\n\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "arg.h"
#include "common.h"
#include "console.h"
+#include "log.h"
#include "sampling.h"
#include "llama.h"
#include <cassert>
-#include <cinttypes>
-#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
static bool is_interacting = false;
static bool need_insert_eot = false;
-static void print_usage(int, char ** argv) {
- printf("\nexample usage:\n");
- printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
- printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
- printf("\n");
+static void print_usage(int argc, char ** argv) {
+ (void) argc;
+
+ LOG("\nexample usage:\n");
+ LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+ LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+ LOG("\n");
}
static bool file_exists(const std::string & path) {
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
- __func__, params.logdir.c_str());
+ LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
return;
}
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
need_insert_eot = true;
} else {
console::cleanup();
- printf("\n");
+ LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
}
#endif
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
-}
-
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
- LOG("formatted: %s\n", formatted.c_str());
+ LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
}
return 1;
}
- auto & sparams = params.sparams;
-
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("main", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+ gpt_init();
- // TODO: Dump params ?
- //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+ auto & sparams = params.sparams;
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
atexit([]() { console::cleanup(); });
if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+ LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
- print_build_info();
+ LOG_INF("%s: llama backend init\n", __func__);
- LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
g_smpl = &smpl;
// load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
}
- LOG("%s: llama threadpool init = n_threads = %d\n",
- __func__,
- (int) params.cpuparams.n_threads
- );
+ LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
- LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
- exit(1);
+ LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+ return 1;
}
// Start the non-batch threadpool in the paused state
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
- LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
- exit(1);
+ LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ return 1;
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print chat template example in conversation mode
if (params.conversation) {
if (params.enable_chat_template) {
- LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+ LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
} else {
- LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+ LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
}
// print system information
{
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
}
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
- LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+ LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
if (!file_exists(path_session)) {
- LOG_TEE("%s: session file does not exist, will create.\n", __func__);
+ LOG_INF("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) {
- LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
+ LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
- LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+ LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
- LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+ LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model));
}
- LOG("add_bos: %d\n", add_bos);
+
+ LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp;
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
- LOG("tokenize the prompt\n");
+ LOG_DBG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else {
- LOG("use session tokens\n");
+ LOG_DBG("use session tokens\n");
embd_inp = session_tokens;
}
- LOG("prompt: \"%s\"\n", log_tostr(prompt));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
- LOG_TEE("error: input is empty\n");
+ LOG_ERR("input is empty\n");
return -1;
}
}
// Tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
n_matching_session_tokens++;
}
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
- LOG_TEE("%s: using full prompt from session file\n", __func__);
+ LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
- LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
+ LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
- LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
} else {
- LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
- LOGLN(
- "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
- log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
+ LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+ embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
- LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
+ LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1);
}
}
if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > add_bos) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- LOG_TEE("'\n");
+ LOG("'\n");
}
- LOG_TEE("\n");
+ LOG_INF("\n");
}
// ctrl+C handling
}
if (params.interactive) {
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
- LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+ LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
- fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
- exit(1);
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+ return 1;
}
- LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
- LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
- LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+ LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
- LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+ LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
}
- LOG_TEE("\n\n");
+ LOG("\n");
if (params.interactive) {
const char * control_message;
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG( " - Press Ctrl+C to interject at any time.\n");
#endif
- LOG_TEE( "%s\n", control_message);
+ LOG( "%s\n", control_message);
is_interacting = params.interactive_first;
}
llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
embd.resize(max_embd_size);
console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
- fflush(stdout);
}
if (ga_n == 1) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() >= n_ctx) {
if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
n_past -= n_discard;
- LOG("after swap: n_past = %d\n", n_past);
+ LOG_DBG("after swap: n_past = %d\n", n_past);
- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
- LOG("clear session path\n");
+ LOG_DBG("clear session path\n");
path_session.clear();
}
} else {
const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
- LOG("\n");
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
- LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+ LOG_DBG("\n");
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+ LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
ga_i += ga_w/ga_n;
- LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+ LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
}
}
n_eval = params.n_batch;
}
- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
- LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+ LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
}
}
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
- LOG("saved session to %s\n", path_session.c_str());
+ LOG_DBG("saved session to %s\n", path_session.c_str());
}
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
- gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
+ gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
- // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
// decrement remaining sampling budget
--n_remain;
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
- gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
+ gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output
- fprintf(stdout, "%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
output_tokens.push_back(id);
output_ss << token_str;
}
-
- fflush(stdout);
}
}
}
if (is_antiprompt) {
- LOG("found antiprompt: %s\n", last_output.c_str());
+ LOG_DBG("found antiprompt: %s\n", last_output.c_str());
}
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
- LOG("found an EOG token\n");
+ LOG_DBG("found an EOG token\n");
if (params.interactive) {
if (!params.antiprompt.empty()) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
- printf("\n");
+ LOG("\n");
}
}
}
if (n_past > 0 && is_interacting) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");
if (params.conversation) {
- printf("\n> ");
+ LOG("\n> ");
}
if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
- printf("%s", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG("%s", params.input_prefix.c_str());
}
// color user input only
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
- printf("%s", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
}
- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
assistant_ss.str("");
n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
- LOG_TEE(" [end of text]\n");
+ LOG(" [end of text]\n");
break;
}
}
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
- LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+ LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
- LOG_TEE("\n");
+ LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
-#ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
return 0;
}
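// --- illustration, not part of the diff ---
// Several failure paths in the hunks above change exit(1) to "return 1" (the
// threadpool and sampler init errors, for example). The diff does not state
// its rationale; one common motivation, sketched below with hypothetical
// names, is that returning from main() unwinds local objects while exit()
// skips their destructors.
#include <cstdio>

struct scoped_resource {
    ~scoped_resource() { fprintf(stderr, "resource released\n"); }
};

static bool init_fails() { return true; }        // stand-in for a failing init step

int main() {
    scoped_resource res;                         // e.g. a wrapper owning a threadpool
    if (init_fails()) {
        fprintf(stderr, "init failed\n");
        return 1;                                // ~scoped_resource() still runs
        // exit(1) here would terminate without running it
    }
    return 0;
}
// --- end illustration ---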
#include "arg.h"
#include "common.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cmath>
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
- printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+ LOG_INF("\n");
+ LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
+ LOG_INF("\n");
}
// Define a split string function to ...
return 1;
}
+ gpt_init();
+
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("parallel", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
- printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+ LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
} else {
// Output each line of the input params.prompts vector and copy to k_prompts
int index = 0;
- printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+ LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
std::vector<std::string> prompts = split_string(params.prompt, '\n');
for (const auto& prompt : prompts) {
k_prompts.resize(index + 1);
k_prompts[index] = prompt;
index++;
- printf("%3d prompt: %s\n", index, prompt.c_str());
+ LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
}
}
- fprintf(stderr, "\n\n");
- fflush(stderr);
+ LOG_INF("\n\n");
const int n_ctx = llama_n_ctx(ctx);
const auto t_main_start = ggml_time_us();
- LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
- LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
- LOG_TEE("\n");
+ LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
+ LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+ LOG_INF("\n");
{
- LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
+ LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
- LOG_TEE("\n");
+ LOG_INF("\n");
}
- LOG_TEE("Processing requests ...\n\n");
+ LOG_INF("Processing requests ...\n\n");
while (true) {
if (dump_kv_cache) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
- LOG_TEE("%s: clearing the KV cache\n", __func__);
+ LOG_INF("%s: clearing the KV cache\n", __func__);
}
// insert new sequences for decoding
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
- LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+ LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
g_seq_id += 1;
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
- LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+ LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return 1;
}
- LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+ LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1;
continue;
}
- LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+ LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
const auto t_main_end = ggml_time_us();
- LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
+ LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
print_date_time();
- LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+ LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
if (params.prompt_file.empty()) {
params.prompt_file = "used built-in defaults";
}
- LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
- LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+ LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+ LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
- LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
- LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
- LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
- LOG_TEE("Cache misses: %6d\n", n_cache_miss);
+ LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
+ LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
+ LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+ LOG_INF("Cache misses: %6d\n", n_cache_miss);
- LOG_TEE("\n");
+ LOG_INF("\n");
// TODO: print sampling/grammar timings for all clients
llama_perf_context_print(ctx);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
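// --- illustration, not part of the diff ---
// Each example now calls gpt_init() right after argument parsing, replacing
// the removed "#ifndef LOG_DISABLE_LOGS ... #endif" blocks (log_set_target,
// log_dump_cmdline, llama_log_set). A hedged sketch of that startup shape;
// gpt_init_sketch() below is a hypothetical stand-in, since the real
// gpt_init() lives in common/ and its body is not shown in this diff.
#include <cstdio>

static void gpt_init_sketch() {
    // assumed responsibilities: hook llama.cpp's log callback into the common
    // logger and print build info once, so examples stop doing it themselves
    fprintf(stderr, "common logging initialized\n");
}

int main(int argc, char ** argv) {
    (void) argc; (void) argv;
    // argument parsing stays per-example (not shown)
    gpt_init_sketch();                            // single shared setup call
    // ... example-specific work follows ...
    return 0;
}
// --- end illustration ---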
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <cmath>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
int n_junk = params.n_junk;
int n_keep = params.n_keep;
int n_grp = params.grp_attn_n;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
- fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: unable to load model\n" , __func__);
return 1;
}
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
- fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1;
}
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+ LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
// print the prompt token-by-token
- LOG_TEE("\n");
- LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
- LOG_TEE("prompt tokens: %d\n", n_tokens_all);
- //LOG_TEE("prompt: %s\n", params.prompt.c_str());
+ LOG_INF("\n");
+ LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
+ LOG_INF("prompt tokens: %d\n", n_tokens_all);
+ //LOG_INF("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
}
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_INF("%s: llama_decode() failed\n", __func__);
return 1;
}
- LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+ LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
if (i + n_batch >= n_tokens_all) {
break;
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch;
- LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
+ LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
}
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
- LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+ LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
}
{
const int n_discard = n_past - n_ctx + n_predict;
if (n_discard > 0) {
- LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+ LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
}
}
- LOG_TEE("\n");
- LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
- LOG_TEE("\n");
+ LOG_INF("\n");
+ LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+ LOG_INF("\n");
// main loop
int n_cur = n_tokens_all;
int n_decode = 0;
- LOG_TEE("%s", prompt_suffix.c_str());
- fflush(stdout);
+ LOG_INF("%s", prompt_suffix.c_str());
const auto t_main_start = ggml_time_us();
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
- LOG_TEE("\n");
+ LOG("\n");
break;
}
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
n_decode += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
- LOG_TEE("\n");
+ LOG("\n");
const auto t_main_end = ggml_time_us();
- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
- fprintf(stderr, "\n");
+ LOG("\n");
llama_sampler_free(smpl);
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
+#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
}
if (params.hellaswag) {
- fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+ LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
return;
}
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+ LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
prob_history.resize(tokens.size());
if (params.ppl_stride <= 0) {
- fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+ LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = n_ctx;
- fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+ LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
- fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+ LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
int count = 0;
double nll = 0.0;
- fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+ LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
- //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+ //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
- //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+ //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- //fprintf(stderr, "%s : failed to eval\n", __func__);
+ //LOG_ERR("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
+ LOG("\n");
- //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+ //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
- printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+ LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
- fflush(stdout);
}
- printf("\n");
+ LOG("\n");
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) {
- fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
- fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+ LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+ LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+ LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
logits.reserve((size_t)n_ctx * n_vocab);
}
- fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+ LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
}
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_INF("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total*n_chunk/n_seq);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
+ LOG("\n");
for (int seq = 0; seq < n_seq_batch; seq++) {
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
- printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+ LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
}
}
- fflush(stdout);
logits.clear();
}
- printf("\n");
+ LOG("\n");
nll2 /= count;
nll /= count;
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
- printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+ LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
- printf("Unexpected negative standard deviation of log(prob)\n");
+ LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
}
llama_batch_free(batch);
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
}
if (prompt_lines.size() % 6 != 0) {
- fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+ LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
return;
}
size_t hs_task_count = prompt_lines.size()/6;
- fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+ LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
- fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+ LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
std::vector<llama_token> seq_tokens[4];
};
- fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
+ LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines
std::vector<hs_data_t> hs_data(hs_task_count);
}
}
- fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+ LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
- printf("\ntask\tacc_norm\n");
+ LOG("\ntask\tacc_norm\n");
double acc = 0.0f;
}
if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
}
}
- //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+ //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
// If the gold ending got the maximum logprob, add one accuracy point
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
}
// Print the accumulated accuracy mean x 100
- printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
- fflush(stdout);
+ LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
}
i0 = i1 - 1;
llama_batch_free(batch);
- printf("\n");
+ LOG("\n");
}
struct winogrande_entry {
}
}
if (ipos != 4) {
- printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+ LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
continue;
}
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
if (sentence[where] == '_') break;
}
if (where == int(sentence.size())) {
- printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+ LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
continue;
}
std::istringstream stream(answer.c_str());
int i_answer; stream >> i_answer;
if (stream.fail() || i_answer < 1 || i_answer > 2) {
- printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+ LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
continue;
}
result.emplace_back();
auto data = load_winogrande_from_csv(params.prompt);
if (data.empty()) {
- fprintf(stderr, "%s: no tasks\n", __func__);
+ LOG_ERR("%s: no tasks\n", __func__);
return;
}
- fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+ LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
- fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+ LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
std::mt19937 rng(1);
std::vector<int> aux(data.size());
for (int i = 0; i < int(data.size()); ++i) {
data = std::move(selected);
}
- fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+ LOG_INF("%s : tokenizing selected tasks\n", __func__);
for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
}
- fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+ LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
}
if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
++n_done;
// print the accumulated accuracy mean x 100
- printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
- fflush(stdout);
+ LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
}
i0 = i1 - 1;
}
- printf("\n");
+ LOG("\n");
if (n_done < 100) return;
const float p = 1.f*n_correct/n_done;
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
- printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+
+ LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
}
static bool deserialize_string(std::istream & in, std::string & str) {
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
- printf("%s: found bad task with empty question and/or answers\n", __func__);
+ LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
- printf("%s: found empty answer\n", __func__);
+ LOG_ERR("%s: found empty answer\n", __func__);
}
return false;
}
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
- printf("%s: no tasks\n", __func__);
+ LOG_ERR("%s: no tasks\n", __func__);
return;
}
- printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+ LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
- printf("%s: failed to read task positions from prompt\n", __func__);
+ LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
return;
}
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
- printf("%s: reading tasks", __func__);
+ LOG_INF("%s: reading tasks", __func__);
int n_dot = std::max((int) n_task/100, 1);
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
- printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+ LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
- if (i%n_dot == 0) printf(".");
+ if (i%n_dot == 0) LOG(".");
}
- printf("done\n");
+ LOG("done\n");
}
else {
- printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+ LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
- printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+ LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
- printf("%s: preparing task data", __func__);
- fflush(stdout);
+ LOG_INF("%s: preparing task data", __func__);
if (n_task > 500) {
- printf("...");
- fflush(stdout);
+ LOG("...");
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
- printf("done\n");
- fflush(stdout);
+ LOG("done\n");
int nbad = n_bad;
if (nbad > 0) {
- printf("%s: found %d malformed tasks\n", __func__, nbad);
+ LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
return;
}
if (i_task%n_dot == 0) {
- printf(".");
- fflush(stdout);
+ LOG(".");
}
}
- printf("done\n");
+ LOG("done\n");
}
- printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+ LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
- printf("\ntask\tacc_norm\n");
+ LOG("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
}
if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
- //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+ //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
- // printf("%d", j+1);
+ // LOG("%d", j+1);
// }
//}
- //printf("\n common_prefix: %zu\n", cur_task.common_prefix);
+ //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
- //printf(" %zu %g\n", ir, eval_results[ir]);
+ //LOG(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
- //printf(" Final: %g\n", log_prob / count);
- //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+ //LOG(" Final: %g\n", log_prob / count);
+ //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
++n_done;
// Print the accumulated accuracy mean x 100
- printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
- fflush(stdout);
+ LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
}
i0 = i1 - 1;
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
- printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+ LOG("\n");
+ LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
- printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+ LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
- printf("\n");
+ LOG_INF("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
- fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+ LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
- fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
- fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
- fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+ LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
- fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
- fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+ LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
- fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
- fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+ LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return;
}
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
- printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
+ LOG("\n");
+ LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
- printf("%4d", i+1);
+ LOG("%4d", i+1);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
- printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+ LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
- printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+ LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
- printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+ LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
- printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+ LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
- printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+ LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
- printf("\n");
-
- fflush(stdout);
+ LOG("\n");
logits.clear();
}
- printf("\n");
+ LOG("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end());
- printf("====== Perplexity statistics ======\n");
+ LOG("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
- printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+ LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
- printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+ LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
- // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+ // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
- printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+ LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
- printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+ LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
- printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+ LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
- printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+ LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
- printf("\n");
+ LOG("\n");
- printf("====== KL divergence statistics ======\n");
+ LOG("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
- printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+ LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
};
- printf("Maximum KLD: %10.6f\n", kld_values.back());
- printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
- printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
- printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
- printf("Median KLD: %10.6f\n", kld_median);
- printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
- printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
- printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
- printf("Minimum KLD: %10.6f\n", kld_values.front());
+ LOG("Maximum KLD: %10.6f\n", kld_values.back());
+ LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
+ LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+ LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+ LOG("Median KLD: %10.6f\n", kld_median);
+ LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
+ LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
+ LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
+ LOG("Minimum KLD: %10.6f\n", kld_values.front());
- printf("\n");
+ LOG("\n");
- printf("====== Token probability statistics ======\n");
+ LOG("====== Token probability statistics ======\n");
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
- printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
+ LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2];
- printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
- printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
- printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
- printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
- printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
- printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
- printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
- printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
- printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
- printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
- printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
- printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
- printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
+ LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
+ LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+ LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+ LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+ LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+ LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+ LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
+ LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+ LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+ LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+ LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+ LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+ LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
- // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+ // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
- printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+ LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count;
- printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
-
+ LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
- fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+ LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
}
if (params.ppl_stride > 0) {
- fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+ LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
- print_build_info();
-
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
struct results_perplexity results;
results = perplexity(ctx, params, n_ctx);
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
+
write_logfile(ctx, params, model, results);
llama_free(ctx);
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <algorithm>
#include <fstream>
+#include <iostream> // TODO: remove me
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+ LOG("\n");
}
struct chunk {
// original file position
size_t filepos;
// original text data
- std::string textdata = "";
+ std::string textdata;
// tokenized text data
std::vector<llama_token> tokens;
// embedding
std::ifstream f(filename.c_str());
if (!f.is_open()) {
- fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+ LOG_ERR("could not open file %s\n", filename.c_str());
return chunks;
}
chunk current_chunk;
char buffer[1024];
int64_t filepos = 0;
- std::string current = "";
+ std::string current;
while (f.read(buffer, 1024)) {
current += std::string(buffer, f.gcount());
size_t pos;
llama_kv_cache_clear(ctx);
// run model
- fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+ LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to decode\n", __func__);
+ LOG_ERR("%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
- fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+ LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
return 1;
}
+ gpt_init();
+
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
params.embedding = true;
if (params.chunk_size <= 0) {
- fprintf(stderr, "chunk_size must be positive\n");
+ LOG_ERR("chunk_size must be positive\n");
return 1;
}
if (params.context_files.empty()) {
- fprintf(stderr, "context_files must be specified\n");
+ LOG_ERR("context_files must be specified\n");
return 1;
}
- print_build_info();
-
- printf("processing files:\n");
+ LOG_INF("processing files:\n");
for (auto & context_file : params.context_files) {
- printf("%s\n", context_file.c_str());
+ LOG_INF("%s\n", context_file.c_str());
}
std::vector<chunk> chunks;
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
- printf("Number of chunks: %ld\n", chunks.size());
+ LOG_INF("Number of chunks: %ld\n", chunks.size());
llama_backend_init();
llama_numa_init(params.numa);
llama_context * ctx = llama_init.context;
if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
- fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+ LOG_ERR("%s: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// max batch size
for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) {
- fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+ LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) chunks.size(); i++) {
- fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
- fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+ LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
- fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+ LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
}
- fprintf(stderr, "\n\n");
+ LOG_INF("\n\n");
}
}
// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
- printf("Enter query: ");
+ LOG("Enter query: ");
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
return a.second > b.second;
});
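+ // similarities holds (chunk index, cosine similarity to the query embedding) pairs, sorted in descending order; print the top-k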
- printf("Top %d similar chunks:\n", params.sparams.top_k);
+ LOG("Top %d similar chunks:\n", params.sparams.top_k);
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
- printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
- printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
- printf("similarity: %f\n", similarities[i].second);
- printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
- printf("--------------------\n");
+ LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+ LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+ LOG("similarity: %f\n", similarities[i].second);
+ LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+ LOG("--------------------\n");
}
}
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
// clean up
set(TARGET llama-server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
- SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
-| `--log-format {text, json}` | log output format: json or text (default: json) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
--parallel 8 \
--batch-size 512 \
--ctx-size 4096 \
- --log-format text \
-ngl 33
```
server_args.append('--cont-batching')
server_args.append('--metrics')
server_args.append('--flash-attn')
- server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
pkwargs = {
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "sampling.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "loading.html.hpp"
#include <atomic>
-#include <chrono>
#include <condition_variable>
#include <cstddef>
+#include <cinttypes>
+#include <deque>
+#include <memory>
#include <mutex>
-#include <thread>
#include <signal.h>
-#include <memory>
-#include <unordered_set>
+#include <thread>
#include <unordered_map>
-#include <deque>
+#include <unordered_set>
-using json = nlohmann::ordered_json;
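+ // scoped logging helpers: each line is prefixed with its scope ("slot", "srv" or "que") and the calling
+ // function name, truncated/padded to 12 chars via "%12.*s"; the SLT_* variants also print the slot and task ids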
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
-bool server_verbose = false;
-bool server_log_json = true;
+#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+using json = nlohmann::ordered_json;
enum stop_type {
STOP_TYPE_FULL,
std::function<void(int)> callback_on_release;
void reset() {
+ SLT_DBG(*this, "%s", "\n");
+
n_prompt_tokens = 0;
generated_text = "";
truncated = false;
return state != SLOT_STATE_IDLE;
}
- void add_token_string(const completion_token_output & token) {
+ void add_token(const completion_token_output & token) {
if (!is_processing()) {
+ SLT_WRN(*this, "%s", "slot is not processing\n");
return;
}
generated_token_probs.push_back(token);
void release() {
if (is_processing()) {
+ SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
+
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
- LOG_INFO("slot released", {
- {"id_slot", id},
- {"id_task", id_task},
- {"n_past", n_past},
- {"truncated", truncated},
- });
callback_on_release(id);
}
}
}
void print_timings() const {
- char buffer[512];
-
- double t_token = t_prompt_processing / n_prompt_tokens_processed;
- double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-
- snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
- t_prompt_processing, n_prompt_tokens_processed,
- t_token, n_tokens_second);
-
- LOG_INFO(buffer, {
- {"id_slot", id},
- {"id_task", id_task},
- {"t_prompt_processing", t_prompt_processing},
- {"n_prompt_tokens_processed", n_prompt_tokens_processed},
- {"t_token", t_token},
- {"n_tokens_second", n_tokens_second},
- });
-
- t_token = t_token_generation / n_decoded;
- n_tokens_second = 1e3 / t_token_generation * n_decoded;
-
- snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
- t_token_generation, n_decoded,
- t_token, n_tokens_second);
-
- LOG_INFO(buffer, {
- {"id_slot", id},
- {"id_task", id_task},
- {"t_token_generation", t_token_generation},
- {"n_decoded", n_decoded},
- {"t_token", t_token},
- {"n_tokens_second", n_tokens_second},
- });
-
- snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
-
- LOG_INFO(buffer, {
- {"id_slot", id},
- {"id_task", id_task},
- {"t_prompt_processing", t_prompt_processing},
- {"t_token_generation", t_token_generation},
- {"t_total", t_prompt_processing + t_token_generation},
- });
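+ // average latency per token (ms) and throughput (tokens/s) for prompt processing and token generation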
+ const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
+ const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+ const double t_gen = t_token_generation / n_decoded;
+ const double n_gen_second = 1e3 / t_token_generation * n_decoded;
+
+ SLT_INF(*this,
+ "\n"
+ "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+ "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+ "\r total time = %10.2f ms / %5d tokens\n",
+ t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
+ t_token_generation, n_decoded, t_gen, n_gen_second,
+ t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
}
};
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
- LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
+ QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
for (auto & task : tasks) {
if (task.id == -1) {
task.id = id++;
- LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
+ QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
// Add a new task, but defer until one slot is available
void defer(server_task task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
+ QUE_DBG("defer task, id = %d\n", task.id);
queue_tasks_deferred.push_back(std::move(task));
condition_tasks.notify_one();
}
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
int new_id = id++;
- LOG_VERBOSE("new task id", {{"new_id", new_id}});
return new_id;
}
running = true;
while (true) {
- LOG_VERBOSE("new task may arrive", {});
+ QUE_DBG("%s", "processing new tasks\n");
while (true) {
std::unique_lock<std::mutex> lock(mutex_tasks);
server_task task = queue_tasks.front();
queue_tasks.pop_front();
lock.unlock();
- LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
+
+ QUE_DBG("processing task, id = %d\n", task.id);
callback_new_task(task);
}
// all tasks in the current loop is processed, slots data is now ready
- LOG_VERBOSE("callback_update_slots", {});
+ QUE_DBG("%s", "update slots\n");
callback_update_slots();
- LOG_VERBOSE("wait for new task", {});
+ QUE_DBG("%s", "waiting for new tasks\n");
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
if (!running) {
- LOG_VERBOSE("ending start_loop", {});
+ QUE_DBG("%s", "terminate\n");
return;
}
condition_tasks.wait(lock, [&]{
// add the id_task to the list of tasks waiting for response
void add_waiting_task_id(int id_task) {
- LOG_VERBOSE("waiting for task id", {{"id_task", id_task}});
+ SRV_DBG("waiting for task id = %d\n", id_task);
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(id_task);
// when the request is finished, we can remove task associated with it
void remove_waiting_task_id(int id_task) {
- LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}});
+ SRV_DBG("task id = %d is done\n", id_task);
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// Send a new result to a waiting id_task
void send(server_task_result & result) {
- LOG_VERBOSE("send new result", {{"id_task", result.id}});
+ SRV_DBG("sending result for task id = %d\n", result.id);
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : waiting_task_ids) {
if (result.id == id_task) {
- LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}});
+ SRV_DBG("task id = %d moved to result queue\n", result.id);
+
queue_results.push_back(std::move(result));
condition_results.notify_all();
return;
struct server_context {
llama_model * model = nullptr;
llama_context * ctx = nullptr;
- std::vector<llama_lora_adapter_container> lora_adapters;
+ std::vector<llama_lora_adapter_container> loras;
gpt_params params;
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
- ctx = llama_init.context;
- lora_adapters = llama_init.lora_adapters;
+ ctx = llama_init.context;
+ loras = llama_init.lora_adapters;
+
params.n_parallel -= 1; // but be sneaky about it
+
if (model == nullptr) {
- LOG_ERROR("unable to load model", {{"model", params.model}});
+ SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
return false;
}
void init() {
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
- LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+ SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
for (int i = 0; i < params.n_parallel; i++) {
server_slot slot;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
- LOG_INFO("new slot", {
- {"id_slot", slot.id},
- {"n_ctx_slot", slot.n_ctx}
- });
+ SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
- LOG_INFO("slot self-extend", {
- {"id_slot", slot.id},
- {"ga_n", ga_n},
- {"ga_w", ga_w}
- });
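+ // ga_n: group-attention factor, ga_w: group-attention width (self-extend parameters)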
+ SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
}
slot.ga_i = 0;
}
if (ret != nullptr) {
- LOG_VERBOSE("selected slot by lcp similarity", {
- {"id_slot", ret->id},
- {"max_lcp_len", max_lcp_len},
- {"similarity", similarity},
- });
+ SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
}
}
}
if (ret != nullptr) {
- LOG_VERBOSE("selected slot by lru", {
- {"id_slot", ret->id},
- {"t_last", t_last},
- });
+ SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
}
}
}
if (slot.params.cache_prompt && slot.ga_n != 1) {
- LOG_WARNING("cache_prompt is not supported with group-attention", {});
slot.params.cache_prompt = false;
+ SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
}
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
// Might be better to reject the request with a 400 ?
- LOG_WARNING("Max tokens to predict exceeds server configuration", {
- {"params.n_predict", slot.params.n_predict},
- {"slot.n_predict", slot.n_predict},
- });
slot.params.n_predict = slot.n_predict;
+ SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
}
// infill
slot.state = SLOT_STATE_PROCESSING_PROMPT;
slot.prompt_tokens.clear();
- LOG_INFO("slot is processing task", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- });
+ SLT_INF(slot, "%s", "processing task\n");
return true;
}
void kv_cache_clear() {
- LOG_VERBOSE("clearing KV cache", {});
+ SRV_DBG("%s", "clearing KV cache\n");
// clear the entire KV cache
llama_kv_cache_clear(ctx);
}
void system_prompt_update() {
- LOG_VERBOSE("system prompt update", {
- {"system_prompt", system_prompt},
- });
+ SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
kv_cache_clear();
system_tokens.clear();
}
if (llama_decode(ctx, batch) != 0) {
- LOG_ERROR("llama_decode() failed", {});
+ SRV_ERR("%s", "llama_decode() failed\n");
return;
}
}
}
bool system_prompt_set(const std::string & sys_prompt) {
- system_prompt = sys_prompt;
+ SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
- LOG_VERBOSE("system prompt process", {
- {"system_prompt", system_prompt},
- });
+ system_prompt = sys_prompt;
// release all slots
for (server_slot & slot : slots) {
// add the token to slot queue and cache
}
- slot.add_token_string(result);
+ slot.add_token(result);
if (slot.params.stream) {
send_partial_response(slot, result);
}
slot.stopped_limit = true;
slot.has_next_token = false;
- LOG_VERBOSE("stopped by limit", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_decoded", slot.n_decoded},
- {"n_predict", slot.params.n_predict},
- });
+ SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
}
if (llama_token_is_eog(model, result.tok)) {
slot.stopped_eos = true;
slot.has_next_token = false;
- LOG_VERBOSE("eos token found", {});
- }
-
- auto n_ctx_train = llama_n_ctx_train(model);
- if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
- && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
- LOG_WARNING("n_predict is not set and self-context extend is disabled."
- " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
- { "id_slot", slot.id },
- { "params.n_predict", slot.params.n_predict },
- { "slot.n_prompt_tokens", slot.n_prompt_tokens },
- { "slot.n_decoded", slot.n_decoded },
- { "slot.n_predict", slot.n_predict },
- { "n_slots", params.n_parallel },
- { "slot.n_ctx", slot.n_ctx },
- { "n_ctx", n_ctx },
- { "n_ctx_train", n_ctx_train },
- { "ga_n", slot.ga_n },
- });
+ SLT_DBG(slot, "%s", "stopped by EOS\n");
+ }
+
+ const auto n_ctx_train = llama_n_ctx_train(model);
+
+ if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
slot.truncated = true;
slot.stopped_limit = true;
slot.has_next_token = false; // stop prediction
+
+ SLT_WRN(slot,
+ "n_predict (%d) is not set and self-context extend is disabled. "
+ "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
+ slot.params.n_predict, n_ctx_train);
}
- LOG_VERBOSE("next token", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"token", result.tok},
- {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
- {"has_next_token", slot.has_next_token},
- {"n_remain", slot.n_remaining},
- {"n_decoded", slot.n_decoded},
- {"stopped_eos", slot.stopped_eos},
- {"stopped_word", slot.stopped_word},
- {"stopped_limit", slot.stopped_limit},
- {"stopping_word", slot.stopping_word},
- });
+ SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
return slot.has_next_token; // continue
}
}
void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
- LOG_ERROR("task error", {
- {"id_task", id_task},
- {"error", error},
- });
+ SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str());
server_task_result res;
res.id = id_task;
}
if (embd == NULL) {
- LOG_ERROR("failed to get embeddings", {
- {"token", batch.token [i]},
- {"seq_id", batch.seq_id[i][0]}
- });
+ SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res.data = json {
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
}
+ SLT_DBG(slot, "%s", "sending embeddings\n");
+
queue_results.send(res);
}
task.type = SERVER_TASK_TYPE_COMPLETION;
if (replace_prompt) {
task.data = task_data;
- task.data["prompt"] = prompt;
+ task.data["prompt"] = std::move(prompt);
} else {
task.data = std::move(task_data);
}
std::vector<server_task> cancel_tasks;
cancel_tasks.reserve(id_tasks.size());
for (const auto & id_task : id_tasks) {
- LOG_VERBOSE("cancel task", {{"id_task", id_task}});
+ SRV_WRN("cancel task, id_task = %d\n", id_task);
+
server_task task;
task.type = SERVER_TASK_TYPE_CANCEL;
task.id_target = id_task;
}
// receive the results from task(s) created by create_tasks_cmpl
- void receive_cmpl_results(const std::unordered_set<int> & id_tasks, std::function<void(std::vector<server_task_result>&)> result_handler, std::function<void(json)> error_handler) {
+ void receive_cmpl_results(
+ const std::unordered_set<int> & id_tasks,
+ const std::function<void(std::vector<server_task_result>&)> & result_handler,
+ const std::function<void(json)> & error_handler) {
// TODO: currently, there is no way to detect the client has cancelled the request
std::vector<server_task_result> results(id_tasks.size());
for (size_t i = 0; i < id_tasks.size(); i++) {
}
// receive the results from task(s) created by create_tasks_cmpl, in stream mode
- void receive_cmpl_results_stream(const std::unordered_set<int> & id_tasks, std::function<bool(server_task_result&)> result_handler, std::function<void(json)> error_handler) {
+ void receive_cmpl_results_stream(
+ const std::unordered_set<int> & id_tasks, const
+ std::function<bool(server_task_result&)> & result_handler, const
+ std::function<void(json)> & error_handler) {
size_t n_finished = 0;
while (true) {
server_task_result result = queue_results.recv(id_tasks);
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
- LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
+ SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
slot->index = json_value(task.data, "index", 0);
if (!launch_slot_with_task(*slot, task)) {
- LOG_ERROR("error while launching slot", task.data);
+ SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
break;
}
} break;
slots_data.push_back(slot_data);
}
- LOG_INFO("slot data", {
- {"id_task", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots}
- });
-
- LOG_VERBOSE("slot data", {
- {"id_task", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots},
- {"slots", slots_data}
- });
+ SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots);
server_task_result res;
res.id = task.id;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
} break;
case SERVER_TASK_TYPE_SET_LORA:
{
- llama_lora_adapters_apply(ctx, lora_adapters);
+ llama_lora_adapters_apply(ctx, loras);
server_task_result result;
result.id = task.id;
result.stop = true;
}
if (all_idle) {
- LOG_INFO("all slots are idle", {});
+ SRV_INF("%s", "all slots are idle\n");
if (system_prompt.empty() && clean_kv_cache) {
kv_cache_clear();
}
}
{
- LOG_VERBOSE("posting NEXT_RESPONSE", {});
+ SRV_DBG("%s", "posting NEXT_RESPONSE\n");
server_task task;
task.type = SERVER_TASK_TYPE_NEXT_RESPONSE;
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
- LOG_INFO("slot context shift", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_keep", n_keep},
- {"n_left", n_left},
- {"n_discard", n_discard},
- {"n_ctx", n_ctx},
- {"n_past", slot.n_past},
- {"n_system_tokens", system_tokens.size()},
- {"n_cache_tokens", slot.cache_tokens.size()}
- });
+ SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
slot.cache_tokens.push_back(slot.sampled);
}
- LOG_VERBOSE("slot decode token", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_ctx", n_ctx},
- {"n_past", slot.n_past},
- {"n_system_tokens", system_tokens.size()},
- {"n_cache_tokens", slot.cache_tokens.size()},
- {"truncated", slot.truncated}
- });
+ SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
+ slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
}
// process in chunks of params.n_batch
// we haven't tokenized the prompt yet - do it now:
if (prompt_tokens.empty()) {
- LOG_VERBOSE("tokenizing prompt", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task}
- });
+ SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size());
slot.t_start_process_prompt = ggml_time_us();
slot.t_start_generation = 0;
slot.n_past = 0;
slot.n_prompt_tokens = prompt_tokens.size();
- LOG_VERBOSE("prompt tokenized", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_prompt_tokens", slot.n_prompt_tokens},
- {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
- });
+ SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
// empty prompt passed -> release the slot and send empty response
if (prompt_tokens.empty()) {
- LOG_INFO("empty prompt - releasing slot", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task}
- });
+ SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
slot.release();
slot.print_timings();
slot.truncated = true;
slot.n_prompt_tokens = prompt_tokens.size();
- LOG_VERBOSE("input truncated", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_left", n_left},
- {"n_prompt_tokens", slot.n_prompt_tokens},
- {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
- });
+ SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
}
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
// we have to evaluate at least 1 token to generate logits.
- LOG_INFO("we have to evaluate at least 1 token to generate logits", {
- { "id_slot", slot.id },
- { "id_task", slot.id_task }
- });
+ SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
slot.n_past--;
if (slot.ga_i > 0) {
// remove the non-common part from the cache
slot.cache_tokens.resize(slot.n_past);
- LOG_INFO("kv cache rm [p0, end)", {
- { "id_slot", slot.id },
- { "id_task", slot.id_task },
- { "p0", p0 }
- });
+ SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
slot_npast++;
}
- LOG_VERBOSE("prompt processing progress", {
- {"id_slot", slot.id},
- {"n_past", slot.n_past},
- {"n_ctx", n_ctx},
- {"n_tokens", batch.n_tokens},
- {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
- });
+ SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
// entire prompt has been processed
if (slot.n_past == slot.n_prompt_tokens) {
slot.n_decoded = 0;
slot.i_batch = batch.n_tokens - 1;
- LOG_VERBOSE("prompt done", {
- {"id_slot", slot.id},
- {"n_past", slot.n_past},
- {"n_ctx", n_ctx},
- {"n_tokens", batch.n_tokens},
- });
+ SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
}
}
}
if (batch.n_tokens == 0) {
- LOG_VERBOSE("no tokens to decode", {});
+ SRV_WRN("%s", "no tokens to decode\n");
return;
}
- LOG_VERBOSE("decoding batch", {
- {"n_tokens", batch.n_tokens},
- });
+ SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, batch_type == 1);
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
- LOG_TEE("\n");
- LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
- LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
- LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+ SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+ SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+ SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
slot.ga_i += slot.ga_w / slot.ga_n;
- LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+ SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
slot.n_past_se += n_tokens;
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
- LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
- {"i", i},
- {"n_batch", n_batch},
- {"ret", ret},
- });
+ SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
for (auto & slot : slots) {
slot.release();
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
n_batch /= 2;
i -= n_batch;
- LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
- {"i", i},
- {"n_batch", n_batch},
- {"ret", ret},
- });
+ SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
continue; // continue loop of n_batch
}
}
}
- LOG_VERBOSE("run slots completed", {});
+ SRV_DBG("%s", "run slots completed\n");
}
json model_meta() const {
return;
}
- LOG_INFO("request", {
- {"remote_addr", req.remote_addr},
- {"remote_port", req.remote_port},
- {"status", res.status},
- {"method", req.method},
- {"path", req.path},
- {"params", req.params},
- });
+ //LOG_INFO("request", {
+ // {"remote_addr", req.remote_addr},
+ // {"remote_port", req.remote_port},
+ // {"status", res.status},
+ // {"method", req.method},
+ // {"path", req.path},
+ // {"params", req.params},
+ //});
+ LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
- LOG_VERBOSE("request", {
- {"request", req.body},
- {"response", res.body},
- });
+ LOG_DBG("request: %s\n", req.body.c_str());
+ LOG_DBG("response: %s\n", res.body.c_str());
}
std::function<void(int)> shutdown_handler;
}
int main(int argc, char ** argv) {
-#if SERVER_VERBOSE != 1
- log_disable();
-#endif
// own arguments required by this example
gpt_params params;
return 1;
}
- // TODO: not great to use extern vars
- server_log_json = params.log_json;
- server_verbose = params.verbosity > 0;
+ gpt_init();
+
+ // enabling this will output extra debug information in the HTTP responses from the server
+ // see format_final_response_oaicompat()
+ const bool verbose = params.verbosity > 9;
// struct that contains llama context and inference
server_context ctx_server;
llama_backend_init();
llama_numa_init(params.numa);
- LOG_INFO("build info", {
- {"build", LLAMA_BUILD_NUMBER},
- {"commit", LLAMA_COMMIT}
- });
-
- LOG_INFO("system info", {
- {"n_threads", params.cpuparams.n_threads},
- {"n_threads_batch", params.cpuparams_batch.n_threads},
- {"total_threads", std::thread::hardware_concurrency()},
- {"system_info", llama_print_system_info()},
- });
+ LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
std::unique_ptr<httplib::Server> svr;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
svr->set_logger(log_server_request);
- auto res_error = [](httplib::Response & res, json error_data) {
+ auto res_error = [](httplib::Response & res, const json & error_data) {
json final_response {{"error", error_data}};
res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
res.status = json_value(error_data, "code", 500);
};
- auto res_ok = [](httplib::Response & res, json data) {
+ auto res_ok = [](httplib::Response & res, const json & data) {
res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
res.status = 200;
};
svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
std::string message;
try {
- std::rethrow_exception(std::move(ep));
+ std::rethrow_exception(ep);
} catch (std::exception & e) {
message = e.what();
} catch (...) {
}
json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
- LOG_VERBOSE("Got exception", formatted_error);
+ LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
res_error(res, formatted_error);
});
// API key is invalid or not provided
res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
- LOG_WARNING("Unauthorized: Invalid API Key", {});
+ LOG_WRN("Unauthorized: Invalid API Key\n");
return false;
};
}
res_ok(res, arr);
}
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
res_error(res, error_data);
});
} else {
const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) {
- ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool {
+ ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool {
return server_sent_event(sink, "data", result.data);
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
});
sink.done();
};
// TODO: maybe merge this function with "handle_completions_generic"
- const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
const auto completion_id = gen_chatcmplid();
if (!stream) {
- ctx_server.receive_cmpl_results(task_ids, [&](std::vector<server_task_result> & results) {
+ ctx_server.receive_cmpl_results(task_ids, [&](const std::vector<server_task_result> & results) {
// multitask is never support in chat completion, there is only one result
- json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id);
+ json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose);
res_ok(res, result_oai);
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
res_error(res, error_data);
});
} else {
const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) {
- ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool {
+ ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool {
std::vector<json> result_array = format_partial_response_oaicompat(result.data, completion_id);
for (auto & event_data : result_array) {
if (event_data.empty()) {
}
}
return true; // ok
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
});
static const std::string ev_done = "data: [DONE]\n\n";
for (const auto & res : results) {
responses.push_back(res.data);
}
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
res_error(res, error_data);
error = true;
});
const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
json result = json::array();
- for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
- auto & la = ctx_server.lora_adapters[i];
+ for (size_t i = 0; i < ctx_server.loras.size(); ++i) {
+ auto & lora = ctx_server.loras[i];
result.push_back({
{"id", i},
- {"path", la.path},
- {"scale", la.scale},
+ {"path", lora.path},
+ {"scale", lora.scale},
});
}
res_ok(res, result);
const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
const std::vector<json> body = json::parse(req.body);
- int max_idx = ctx_server.lora_adapters.size();
+ int max_idx = ctx_server.loras.size();
// clear existing value
- for (auto & la : ctx_server.lora_adapters) {
- la.scale = 0.0f;
+ for (auto & lora : ctx_server.loras) {
+ lora.scale = 0.0f;
}
// set value
int id = entry.at("id");
float scale = entry.at("scale");
if (0 <= id && id < max_idx) {
- ctx_server.lora_adapters[id].scale = scale;
+ ctx_server.loras[id].scale = scale;
} else {
throw std::runtime_error("invalid adapter id");
}
// bind HTTP listen port, run the HTTP server in a thread
if (!svr->bind_to_port(params.hostname, params.port)) {
- LOG_ERROR("couldn't bind HTTP server socket", {
- {"hostname", params.hostname},
- {"port", params.port},
- });
+ //LOG_ERROR("couldn't bind HTTP server socket", {
+ // {"hostname", params.hostname},
+ // {"port", params.port},
+ //});
+ LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
clean_up();
- LOG_ERROR("exiting due to HTTP server error", {});
return 1;
}
std::thread t([&]() { svr->listen_after_bind(); });
svr->wait_until_ready();
- LOG_INFO("HTTP server is listening", log_data);
+ //LOG_INFO("HTTP server is listening", log_data);
+ LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http);
// load the model
- LOG_INFO("loading model", log_data);
+ LOG_INF("%s: loading model\n", __func__);
+
if (!ctx_server.load_model(params)) {
clean_up();
t.join();
- LOG_ERROR("exiting due to model loading error", {});
+ LOG_ERR("%s: exiting due to model loading error\n", __func__);
return 1;
- } else {
- ctx_server.init();
- state.store(SERVER_STATE_READY);
+ }
- LOG_INFO("model loaded", {});
+ ctx_server.init();
+ state.store(SERVER_STATE_READY);
- // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (params.chat_template.empty()) {
- if (!ctx_server.validate_model_chat_template()) {
- LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- params.chat_template = "chatml";
- }
- }
+ LOG_INF("%s: model loaded\n", __func__);
- // print sample chat example to make it clear which template is used
- {
- LOG_INFO("chat template", {
- {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
- {"built_in", params.chat_template.empty()},
- });
+ // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
+ if (params.chat_template.empty()) {
+ if (!ctx_server.validate_model_chat_template()) {
+ LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+ params.chat_template = "chatml";
}
+ }
- ctx_server.queue_tasks.on_new_task(std::bind(
- &server_context::process_single_task, &ctx_server, std::placeholders::_1));
- ctx_server.queue_tasks.on_update_slots(std::bind(
- &server_context::update_slots, &ctx_server));
+ // print sample chat example to make it clear which template is used
+ LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
- shutdown_handler = [&](int) {
- ctx_server.queue_tasks.terminate();
- };
- ctx_server.queue_tasks.start_loop();
- }
+ ctx_server.queue_tasks.on_new_task(std::bind(
+ &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_update_slots(std::bind(
+ &server_context::update_slots, &ctx_server));
+
+ shutdown_handler = [&](int) {
+ ctx_server.queue_tasks.terminate();
+ };
+
+ LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+ ctx_server.queue_tasks.start_loop();
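For reference, a minimal standalone sketch (not part of the patch) of the callback-wiring pattern used above: member functions are registered on a queue via std::bind and the queue then drives them from its loop. The task_queue and worker names here are hypothetical stand-ins.

    #include <cstdio>
    #include <functional>
    #include <vector>

    struct task_queue {
        std::function<void(int)> on_new_task;

        void start_loop(const std::vector<int> & tasks) {
            for (int t : tasks) {
                if (on_new_task) {
                    on_new_task(t); // dispatch each task to the registered handler
                }
            }
        }
    };

    struct worker {
        void process_single_task(int id) { std::printf("processing task %d\n", id); }
    };

    int main() {
        task_queue queue;
        worker     ctx;

        // bind the member function to the context instance, as the server code above does
        queue.on_new_task = std::bind(&worker::process_single_task, &ctx, std::placeholders::_1);

        queue.start_loop({1, 2, 3});
        return 0;
    }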
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
-| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
### Run @bug, @wip or @wrong_usage annotated scenario
server_args.append('--verbose')
if context.lora_file:
server_args.extend(['--lora', context.lora_file])
- if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
- server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [context.server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
#pragma once
-#include "llama.h"
#include "common.h"
+#include "log.h"
+#include "llama.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
+#include <random>
+#include <sstream>
#include <string>
#include <vector>
-#include <sstream>
-#include <random>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
-extern bool server_verbose;
-extern bool server_log_json;
-
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...) \
- do \
- { \
- if (server_verbose) \
- { \
- server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
- } \
- } while (0)
-#endif
-
-#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-
-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
-
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
try {
return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
- std::stringstream ss;
- ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
- LOG_WARNING(ss.str().c_str(), body);
+ LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
return default_value;
}
} else {
}
}
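As an illustrative aside (not part of the patch), this is how a json_value-style helper typically behaves: typed access with a fallback default when a field is missing, null, or has the wrong type. The get_or name and the field names below are hypothetical, and <nlohmann/json.hpp> stands in for the bundled json.hpp.

    #include <cstdio>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // hypothetical stand-in for json_value
    template <typename T>
    static T get_or(const json & body, const std::string & key, const T & default_value) {
        if (!body.contains(key) || body.at(key).is_null()) {
            return default_value;
        }
        try {
            return body.at(key).get<T>();
        } catch (const nlohmann::json::type_error &) {
            return default_value; // wrong type supplied: fall back to the default
        }
    }

    int main() {
        const json body = json::parse(R"({"n_predict": 32, "stream": "yes"})");

        std::printf("n_predict = %d\n", get_or(body, "n_predict", 16));        // present: prints 32
        std::printf("stream    = %d\n", (int) get_or(body, "stream", false));  // wrong type: falls back to false
        return 0;
    }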
-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
- std::stringstream ss_tid;
- ss_tid << std::this_thread::get_id();
- json log = json{
- {"tid", ss_tid.str()},
- {"timestamp", time(nullptr)},
- };
-
- if (server_log_json) {
- log.merge_patch({
- {"level", level},
- {"function", function},
- {"line", line},
- {"msg", message},
- });
-
- if (!extra.empty()) {
- log.merge_patch(extra);
- }
-
- printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
- } else {
- char buf[1024];
- snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
- if (!extra.empty()) {
- log.merge_patch(extra);
- }
- std::stringstream ss;
- ss << buf << " |";
- for (const auto & el : log.items())
- {
- const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
- ss << " " << el.key() << "=" << value;
- }
-
- const std::string str = ss.str();
- printf("%.*s\n", (int)str.size(), str.data());
- }
- fflush(stdout);
-}
-
//
// chat template utils
//
chat.push_back({role, content});
}
- auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
- LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
+ const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+ LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
+
return formatted_chat;
}
}
static std::string gen_chatcmplid() {
- std::stringstream chatcmplid;
- chatcmplid << "chatcmpl-" << random_string();
-
- return chatcmplid.str();
+ return "chatcmpl-" + random_string();
}
//
return std::string::npos;
}
-static bool json_is_array_of_numbers(json data) {
+static bool json_is_array_of_numbers(const json & data) {
if (data.is_array()) {
for (const auto & e : data) {
if (!e.is_number()) {
return out;
}
-static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
+static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str =
std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
+ "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
+ LOG_DBG("data stream, to_send: %s", str.c_str());
return sink.write(str.c_str(), str.size());
}
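A minimal sketch (not part of the patch) of the framing that the trailing blank line provides: in the server-sent events format each event ends with an empty line, so chunks written back-to-back remain separable on the client side. The payloads below are hypothetical.

    #include <cstdio>
    #include <string>

    int main() {
        const std::string chunk = "{\"content\":\"hello\"}"; // hypothetical partial completion
        const std::string error = "{\"message\":\"oops\"}";  // hypothetical error payload

        // each event is "<name>: <data>" followed by a blank line
        std::printf("data: %s\n\n",  chunk.c_str());
        std::printf("error: %s\n\n", error.c_str());
        std::printf("data: [DONE]\n\n");
        return 0;
    }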
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
- for (auto & param : unsupported_params) {
+ for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
return llama_params;
}
-static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
+static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
{"id", completion_id}
};
- if (server_verbose) {
+ // extra fields for debugging purposes
+ if (verbose) {
res["__verbose"] = result;
}
}
// return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array();
int i = 0;
- for (auto & elem : embeddings) {
+ for (const auto & elem : embeddings) {
data.push_back(json{
{"embedding", json_value(elem, "embedding", json::array())},
{"index", i++},
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
-#include <cmath>
-#include <cstdio>
-#include <string>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
// total length of the sequence including the prompt
const int n_predict = params.n_predict;
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
- LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+ LOG("\n");
+ LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
- LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
- LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
+ LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+ LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
- fprintf(stderr, "\n");
+ LOG("\n");
for (auto id : tokens_list) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
- fflush(stderr);
-
// create a llama_batch with size 512
// we use this object to submit token data for decoding
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
while (n_cur <= n_predict) {
// sample the next token
{
- const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
+ const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
- LOG_TEE("\n");
+ LOG("\n");
break;
}
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
// prepare the next batch
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
- LOG_TEE("\n");
+ LOG("\n");
const auto t_main_end = ggml_time_us();
- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
- fprintf(stderr, "\n");
+ LOG("\n");
llama_batch_free(batch);
llama_sampler_free(smpl);
#include "arg.h"
#include "common.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
+#include <algorithm>
#include <cstdio>
+#include <cstring>
+#include <random>
+#include <set>
#include <string>
#include <vector>
-#include <set>
-#include <random>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
return 1;
}
+ gpt_init();
+
if (params.model_draft.empty()) {
- fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
+ LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
std::default_random_engine rng(params.sparams.seed);
std::uniform_real_distribution<> u_dist;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("speculative", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
ctx_dft = llama_init_dft.context;
    const auto vocab_type_tgt = llama_vocab_type(model_tgt);
- LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+ LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
    const auto vocab_type_dft = llama_vocab_type(model_dft);
- LOG("vocab_type dft: %d\n", vocab_type_dft);
+ LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
- fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
- fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+ LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
+ LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
return 1;
}
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
) {
- fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+ LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
return 1;
}
: n_vocab_dft - n_vocab_tgt;
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
- fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
- fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+ LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+ LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
- fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
- fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
+ LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
+ LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
- fflush(stderr);
-
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
active_seqs.insert(s);
const auto & tokens = drafts[s].tokens;
- LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
+ LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
}
int i_dft = 0;
continue;
}
- LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+ LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
break;
}
}
- LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+ LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
if (r <= p_tgt / p_dft) {
s_keep = s;
accept = true;
token_str = llama_token_to_piece(ctx_tgt, token_id);
gpt_sampler_accept(smpl, token_id, true);
- LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
- LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;
// calculate residual probability
if (!accept) {
// all drafted tokens were rejected
// sample from the target model
- LOG("all drafted tokens were rejected, sampling from residual distribution\n");
+ LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
std::vector<float> probs(dist_tgt.size);
for (size_t i = 0; i < dist_tgt.size; ++i) {
probs[i] = dist_tgt.data[i].p;
// greedy verification
// sample from the target model
- LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+ LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
gpt_sampler_accept(smpl, token_id, true);
- //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
-
token_str = llama_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) {
}
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
- LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+ LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
s_keep = s;
accept = true;
++i_dft;
if (params.use_color) {
// Color token according to its origin sequence
- printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+ LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
} else {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
- fflush(stdout);
continue;
} else {
- printf("%s", token_str.c_str());
- fflush(stdout);
+ LOG("%s", token_str.c_str());
break;
}
}
}
{
- LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+ LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
// TODO: simplify
{
- LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+ LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_kv_cache_seq_keep(ctx_dft, s_keep);
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
- // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+ // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
++n_past_dft;
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
- LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+ LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
}
// attempt to split the branch if the probability is high enough
for (int f = 1; f < 8; ++f) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
- LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
+ LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}
- // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+ // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
auto t_dec_end = ggml_time_us();
- LOG_TEE("\n\n");
+ LOG("\n\n");
- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
- LOG_TEE("\ndraft:\n\n");
+ LOG_INF("\n");
+ LOG_INF("draft:\n\n");
// TODO: print sampling/grammar timings for all drafts
llama_perf_context_print(ctx_dft);
- LOG_TEE("\ntarget:\n\n");
+ LOG_INF("\n");
+ LOG_INF("target:\n\n");
gpt_perf_print(ctx_tgt, smpl);
gpt_sampler_free(smpl);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "common.h"
+//#include "log.h" // TODO: start using log.h
#include "llama.h"
-#include <cmath>
#include <cstdio>
+#include <cstring>
#include <fstream>
#include <string>
#include <vector>
+#include <iostream> // TODO: remove me
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <shellapi.h> // For CommandLineToArgvW
#endif
-static void print_usage_information(const char * argv0, FILE * stream) {
- fprintf(stream, "usage: %s [options]\n\n", argv0);
- fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
- fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
- fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
- fprintf(stream, "to control the behavior of the tokenizer.\n\n");
- fprintf(stream, " The possible options are:\n");
- fprintf(stream, "\n");
- fprintf(stream, " -h, --help print this help and exit\n");
- fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
- fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
- fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
- fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
- fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
- fprintf(stream, " --stdin read prompt from standard input.\n");
- fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
- fprintf(stream, " --no-parse-special do not parse control tokens.\n");
- fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
- fprintf(stream, " --show-count print the total number of tokens.\n");
+static void print_usage_information(const char * argv0) {
+ printf("usage: %s [options]\n\n", argv0);
+ printf("The tokenize program tokenizes a prompt using a given model,\n");
+ printf("and prints the resulting tokens to standard output.\n\n");
+ printf("It needs a model file, a prompt, and optionally other flags\n");
+ printf("to control the behavior of the tokenizer.\n\n");
+ printf(" The possible options are:\n");
+ printf("\n");
+ printf(" -h, --help print this help and exit\n");
+ printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
+ printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
+ printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+ printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+ printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
+ printf(" --stdin read prompt from standard input.\n");
+ printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+ printf(" --no-parse-special do not parse control tokens.\n");
+ printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+ printf(" --show-count print the total number of tokens.\n");
}
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
const int argc = argv.size();
if (argc <= 1) {
- print_usage_information(argv[0].c_str(), stderr);
+ print_usage_information(argv[0].c_str());
return 1;
}
for (; iarg < argc; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-h" || arg == "--help") {
- print_usage_information(argv[0].c_str(), stdout);
+ print_usage_information(argv[0].c_str());
return 0;
}
else if (arg == "--ids") {
// Start actually doing the tokenizing stuff.
//////
-#ifdef LOG_DISABLE_LOGS
- disable_logging = true;
-#endif
-
if (disable_logging) {
llama_log_set(llama_log_callback_null, NULL);
}
};
enum ggml_log_level {
- GGML_LOG_LEVEL_ERROR = 2,
- GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
- GGML_LOG_LEVEL_DEBUG = 5
+ GGML_LOG_LEVEL_NONE = 0,
+ GGML_LOG_LEVEL_INFO = 1,
+ GGML_LOG_LEVEL_WARN = 2,
+ GGML_LOG_LEVEL_ERROR = 3,
+ GGML_LOG_LEVEL_DEBUG = 4,
};
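A small standalone sketch (not part of the patch) of how a log callback can filter on the renumbered levels; note that DEBUG now has the highest numeric value, so a purely numeric threshold has to treat it separately. The local enum below only mirrors the values above so the example is self-contained, and the threshold is hypothetical.

    #include <cstdio>

    // mirrors the renumbered ggml_log_level values for this standalone sketch
    enum log_level_example { LVL_NONE = 0, LVL_INFO = 1, LVL_WARN = 2, LVL_ERROR = 3, LVL_DEBUG = 4 };

    static void log_cb(int level, const char * text) {
        const int min_level = LVL_WARN; // hypothetical verbosity threshold

        if (level == LVL_DEBUG) {
            return; // debug is numerically highest, so it is handled explicitly
        }
        if (level == LVL_NONE || level >= min_level) {
            std::fputs(text, stderr);
        }
    }

    int main() {
        log_cb(LVL_INFO,  "info: filtered out by the threshold\n");
        log_cb(LVL_ERROR, "error: printed\n");
        log_cb(LVL_DEBUG, "debug: suppressed\n");
        return 0;
    }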
enum ggml_tensor_flag {
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#ifdef GGML_METAL_NDEBUG
+#define GGML_METAL_LOG(...)
#define GGML_METAL_LOG_INFO(...)
#define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...)
#else
+#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#endif
#define UNUSED(x) (void)(x)
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) {
- GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
+ GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0,
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
- } else {
- GGML_METAL_LOG_INFO("\n");
}
} else {
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
*cur_percentage_p = percentage;
- LLAMA_LOG_INFO(".");
+ LLAMA_LOG(".");
if (percentage >= 100) {
- LLAMA_LOG_INFO("\n");
+ LLAMA_LOG("\n");
}
}
return true;
if (len < 128) {
g_state.log_callback(level, buffer, g_state.log_callback_user_data);
} else {
- char* buffer2 = new char[len+1];
- vsnprintf(buffer2, len+1, format, args_copy);
+ char * buffer2 = new char[len + 1];
+ vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0;
g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
delete[] buffer2;
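For context, a minimal sketch (not part of the patch) of the pattern the code above follows: format into a small fixed buffer first and, if vsnprintf reports that the message did not fit, retry with an exactly sized heap buffer. The log_fmt name is hypothetical.

    #include <cstdarg>
    #include <cstdio>

    // hypothetical helper demonstrating the fixed-buffer-then-heap formatting pattern
    static void log_fmt(const char * format, ...) {
        va_list args;
        va_start(args, format);

        va_list args_copy;
        va_copy(args_copy, args);

        char buffer[128];
        const int len = vsnprintf(buffer, 128, format, args);
        if (len < 128) {
            fputs(buffer, stderr);
        } else {
            // vsnprintf returned the full length, so allocate exactly len + 1 and format again
            char * buffer2 = new char[len + 1];
            vsnprintf(buffer2, len + 1, format, args_copy);
            fputs(buffer2, stderr);
            delete[] buffer2;
        }

        va_end(args_copy);
        va_end(args);
    }

    int main() {
        log_fmt("a short message: %d\n", 42);
        return 0;
    }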
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-log.cpp)
llama_target_and_test(test-arg-parser.cpp)
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
- assert(params.verbosity == 1);
+ assert(params.verbosity > 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
--- /dev/null
+#include "log.h"
+
+#include <cstdlib>
+#include <thread>
+
+int main() {
+ const int n_thread = 8;
+
+ std::thread threads[n_thread];
+ for (int i = 0; i < n_thread; i++) {
+ threads[i] = std::thread([i]() {
+ const int n_msg = 1000;
+
+ for (int j = 0; j < n_msg; j++) {
+ const int log_type = std::rand() % 4;
+
+ switch (log_type) {
+ case 0: LOG_INF("Thread %d: %d\n", i, j); break;
+ case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
+ case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
+ case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
+ default:
+ break;
+ }
+
+                if (std::rand() % 10 < 5) {
+                    gpt_log_set_timestamps(gpt_log_main(), std::rand() % 2);
+                    gpt_log_set_prefix    (gpt_log_main(), std::rand() % 2);
+ }
+ }
+ });
+ }
+
+ for (int i = 0; i < n_thread; i++) {
+ threads[i].join();
+ }
+
+ return 0;
+}