BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
GGML_N_THREADS: 1
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
jobs:
macOS-latest-cmake-arm64:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+env:
+ LLAMA_LOG_COLORS: 1
+ LLAMA_LOG_PREFIX: 1
+ LLAMA_LOG_TIMESTAMPS: 1
+ LLAMA_LOG_VERBOSITY: 10
+
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
tests/test-grammar-parser \
tests/test-json-schema-to-grammar \
tests/test-llama-grammar \
+ tests/test-log \
tests/test-model-load-cancel \
tests/test-opt \
tests/test-quantize-fns \
DEPRECATE_WARNING := 1
endif
+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
+
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
MK_LDFLAGS += -fsanitize=undefined -g
endif
-ifdef LLAMA_SERVER_VERBOSE
- MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
-endif
-
ifdef LLAMA_SERVER_SSL
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
MK_LDFLAGS += -lssl -lcrypto
endif
-ifdef LLAMA_DISABLE_LOGS
- MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
-
# warnings
WARN_FLAGS = \
-Wall \
OBJ_COMMON = \
common/common.o \
common/arg.o \
+ common/log.o \
common/console.o \
common/ngram-cache.o \
common/sampling.o \
$(info )
endif
+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info )
+endif
+
#
# Build libraries
#
common/arg.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+common/log.o: \
+ common/log.cpp \
+ common/log.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
common/sampling.o: \
common/sampling.cpp \
common/sampling.h \
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
- $(OBJ_GGML) $(OBJ_LLAMA)
+ $(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+tests/test-log: tests/test-log.cpp \
+ $(OBJ_ALL)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
tests/test-grammar-parser: tests/test-grammar-parser.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
## main
+export LLAMA_LOG_PREFIX=1
+export LLAMA_LOG_TIMESTAMPS=1
+
if [ -z ${GG_BUILD_LOW_PERF} ]; then
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
rm -rf ${SRC}/models-mnt
set(TARGET common)
add_library(${TARGET} STATIC
+ arg.cpp
+ arg.h
base64.hpp
- common.h
common.cpp
- arg.h
- arg.cpp
- sampling.h
- sampling.cpp
- console.h
+ common.h
console.cpp
- json.hpp
+ console.h
json-schema-to-grammar.cpp
- train.h
- train.cpp
- ngram-cache.h
+ json.hpp
+ log.cpp
+ log.h
ngram-cache.cpp
+ ngram-cache.h
+ sampling.cpp
+ sampling.h
+ train.cpp
+ train.h
)
if (BUILD_SHARED_LIBS)
#include "arg.h"
+#include "log.h"
#include "sampling.h"
#include <algorithm>
-#include <string>
-#include <vector>
-#include <set>
+#include <climits>
+#include <cstdarg>
#include <fstream>
#include <regex>
-#include <cstdarg>
-#include <climits>
+#include <set>
+#include <string>
+#include <thread>
+#include <vector>
#include "json-schema-to-grammar.h"
exit(0);
}
));
- add_opt(llama_arg(
- {"-v", "--verbose"},
- "print verbose information",
- [](gpt_params & params) {
- params.verbosity = 1;
- }
- ));
- add_opt(llama_arg(
- {"--verbosity"}, "N",
- format("set specific verbosity level (default: %d)", params.verbosity),
- [](gpt_params & params, int value) {
- params.verbosity = value;
- }
- ));
add_opt(llama_arg(
{"--verbose-prompt"},
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
[](gpt_params & params) {
params.use_color = true;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(llama_arg(
{"-t", "--threads"}, "N",
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
params.input_prefix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
add_opt(llama_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
params.input_suffix = value;
params.enable_chat_template = false;
}
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
add_opt(llama_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
params.system_prompt = system_prompt;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
- add_opt(llama_arg(
- {"--log-format"}, "{text, json}",
- "log output format: json or text (default: json)",
- [](gpt_params & params, const std::string & value) {
- if (value == "json") {
- params.log_json = true;
- } else if (value == "text") {
- params.log_json = false;
- } else {
- throw std::invalid_argument("invalid value");
- }
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(llama_arg(
{"--metrics"},
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_BENCH}));
-#ifndef LOG_DISABLE_LOGS
- // TODO: make this looks less weird
- add_opt(llama_arg(
- {"--log-test"},
- "Log test",
- [](gpt_params &) { log_param_single_parse("--log-test"); }
- ));
add_opt(llama_arg(
{"--log-disable"},
"Log disable",
- [](gpt_params &) { log_param_single_parse("--log-disable"); }
+ [](gpt_params &) {
+ gpt_log_pause(gpt_log_main());
+ }
));
add_opt(llama_arg(
- {"--log-enable"},
- "Log enable",
- [](gpt_params &) { log_param_single_parse("--log-enable"); }
+ {"--log-file"}, "FNAME",
+ "Log to file",
+ [](gpt_params &, const std::string & value) {
+ gpt_log_set_file(gpt_log_main(), value.c_str());
+ }
));
add_opt(llama_arg(
- {"--log-new"},
- "Log new",
- [](gpt_params &) { log_param_single_parse("--log-new"); }
- ));
+ {"--log-colors"},
+ "Enable colored logging",
+ [](gpt_params &) {
+ gpt_log_set_colors(gpt_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_COLORS"));
add_opt(llama_arg(
- {"--log-append"},
- "Log append",
- [](gpt_params &) { log_param_single_parse("--log-append"); }
+ {"-v", "--verbose", "--log-verbose"},
+ "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
+ [](gpt_params & params) {
+ params.verbosity = INT_MAX;
+ gpt_log_set_verbosity_thold(INT_MAX);
+ }
));
add_opt(llama_arg(
- {"--log-file"}, "FNAME",
- "Log file",
- [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
- ));
-#endif // LOG_DISABLE_LOGS
+ {"-lv", "--verbosity", "--log-verbosity"}, "N",
+ "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
+ [](gpt_params & params, int value) {
+ params.verbosity = value;
+ gpt_log_set_verbosity_thold(value);
+ }
+ ).set_env("LLAMA_LOG_VERBOSITY"));
+ add_opt(llama_arg(
+ {"--log-prefix"},
+        "Enable prefix in log messages",
+ [](gpt_params &) {
+ gpt_log_set_prefix(gpt_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_PREFIX"));
+ add_opt(llama_arg(
+ {"--log-timestamps"},
+ "Enable timestamps in log messages",
+ [](gpt_params &) {
+ gpt_log_set_timestamps(gpt_log_main(), true);
+ }
+ ).set_env("LLAMA_LOG_TIMESTAMPS"));
return ctx_arg;
}
#endif
#include "common.h"
+#include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include <unordered_map>
#include <unordered_set>
#include <vector>
+#include <thread>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
-#include <thread>
#include <future>
#endif
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
- fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+ LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
}
if (!setpriority(PRIO_PROCESS, 0, p)) {
- fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+ LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
return true;
if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues.
- fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+ LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
}
}
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) {
- fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+ LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false;
}
} else {
start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) {
- fprintf(stderr, "Start index out of bounds!\n");
+ LOG_ERR("Start index out of bounds!\n");
return false;
}
}
} else {
end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) {
- fprintf(stderr, "End index out of bounds!\n");
+ LOG_ERR("End index out of bounds!\n");
return false;
}
}
} else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10;
} else {
- fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+ LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false;
}
return true;
}
+void gpt_init() {
+ llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+ if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
+ gpt_log_add(gpt_log_main(), level, "%s", text);
+ }
+ }, NULL);
+
+#ifdef NDEBUG
+ const char * build_type = "";
+#else
+ const char * build_type = " (debug)";
+#endif
+
+ LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+}
+
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
s = std::move(builder);
}
+std::string string_from(bool value) {
+ return value ? "true" : "false";
+}
+
+std::string string_from(const std::vector<int> & values) {
+ std::stringstream buf;
+
+ buf << "[ ";
+ bool first = true;
+ for (auto e : values) {
+ if (first) {
+ first = false;
+ } else {
+ buf << ", ";
+ }
+ buf << std::to_string(e);
+ }
+ buf << " ]";
+
+ return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+ std::stringstream buf;
+
+ buf << "[ ";
+
+ bool first = true;
+ for (const auto & token : tokens) {
+ if (!first) {
+ buf << ", ";
+ } else {
+ first = false;
+ }
+
+ auto detokenized = llama_token_to_piece(ctx, token);
+
+ detokenized.erase(
+ std::remove_if(
+ detokenized.begin(),
+ detokenized.end(),
+ [](const unsigned char c) { return !std::isprint(c); }),
+ detokenized.end());
+
+ buf << "'" << detokenized << "'"
+ << ":" << std::to_string(token);
+ }
+
+ buf << " ]";
+
+ return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
+ std::stringstream buf;
+
+ buf << "[ ";
+
+ bool first = true;
+ for (int i = 0; i < batch.n_tokens; ++i) {
+ if (!first) {
+ buf << ", ";
+ } else {
+ first = false;
+ }
+
+ auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+
+ detokenized.erase(
+ std::remove_if(
+ detokenized.begin(),
+ detokenized.end(),
+ [](const unsigned char c) { return !std::isprint(c); }),
+ detokenized.end());
+
+ buf << "\n" << std::to_string(i)
+ << ":token '" << detokenized << "'"
+ << ":pos " << std::to_string(batch.pos[i])
+ << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
+ << ":seq_id " << std::to_string(batch.seq_id[i][0])
+ << ":logits " << std::to_string(batch.logits[i]);
+ }
+
+ buf << " ]";
+
+ return buf.str();
+}
+
void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
const char * sep = strchr(data, '=');
if (sep == nullptr || sep - data >= 128) {
- fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
return false;
}
llama_model_kv_override kvo;
} else if (std::strcmp(sep, "false") == 0) {
kvo.val_bool = false;
} else {
- fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
return false;
}
} else if (strncmp(sep, "str:", 4) == 0) {
sep += 4;
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
if (strlen(sep) > 127) {
- fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+ LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
return false;
}
strncpy(kvo.val_str, sep, 127);
kvo.val_str[127] = '\0';
} else {
- fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
return false;
}
overrides.emplace_back(std::move(kvo));
}
if (model == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return iparams;
}
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return iparams;
}
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
- fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
+ LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
return iparams;
}
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
- fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+ LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
params.sparams.ignore_eos = false;
}
if (params.warmup) {
- LOG("warming up the model with an empty run\n");
+ LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
std::vector<llama_token> tmp;
llama_token bos = llama_token_bos(model);
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
- fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+ LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
CURLcode res = curl_easy_perform(curl);
if (res == CURLE_OK) {
}
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
- fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+ LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
remaining_attempts--;
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
}
- fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+ LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
return false;
}
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
if (!curl) {
- fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+ LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
}
if (metadata_in.good()) {
try {
metadata_in >> metadata;
- fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+ LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
- fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+ LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
- fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+ LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
return false;
}
}
} else {
- fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
+ LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
}
// Send a HEAD request to retrieve the etag and last-modified headers
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
force_download = true;
- fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+ LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
bool should_download = !file_exists || force_download;
if (!should_download) {
if (!etag.empty() && etag != headers.etag) {
- fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+ LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
should_download = true;
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
- fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+ LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
should_download = true;
}
}
if (should_download) {
std::string path_temporary = path + ".downloadInProgress";
if (file_exists) {
- fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+ LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
if (remove(path.c_str()) != 0) {
- fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
+ LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
return false;
}
}
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
if (!outfile) {
- fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
+ LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
return false;
}
};
// start the download
- fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+ LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
if (!was_perform_successful) {
long http_code = 0;
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
- fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
+ LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
return false;
}
{"lastModified", headers.last_modified}
});
std::ofstream(metadata_path) << metadata.dump(4);
- fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+ LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
- fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+ LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
return false;
}
}
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
- fprintf(stderr, "%s: invalid model_url\n", __func__);
+ LOG_ERR("%s: invalid model_url\n", __func__);
return NULL;
}
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
if (!ctx_gguf) {
- fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
return NULL;
}
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
- fprintf(stderr, "\n%s: unexpected model file name: %s"
- " n_split=%d\n", __func__, path_model, n_split);
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
- fprintf(stderr, "\n%s: unexpected model url: %s"
- " n_split=%d\n", __func__, model_url, n_split);
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
return NULL;
}
}
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
- fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
const char * /*path_model*/,
const char * /*hf_token*/,
const struct llama_model_params & /*params*/) {
- fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+ LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}
};
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
if (!ctx_gguf) {
- fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
return result;
}
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
if (n_tensors == 0) {
- fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
+ LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
}
for (int i = 0; i < n_tensors; i++) {
}
}
if (layer_idx < 0) {
- fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
} else if (layer_idx == 0) {
- fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
if (tensor->type != GGML_TYPE_F32) {
- fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (ggml_n_dims(tensor) != 1) {
- fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor);
} else if (ggml_nelements(tensor) != result.n_embd) {
- fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+ LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
result.n_embd = -1;
break;
}
}
if (result.n_embd == -1) {
- fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+ LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
result.data.clear();
}
break;
}
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
- fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+ LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
result.n_embd = -1;
break;
}
}
if (result.n_embd == -1) {
- fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+ LOG_ERR("%s: no valid control vector files passed\n", __func__);
result.data.clear();
}
#include "llama.h"
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
#include <string>
#include <vector>
+#include <sstream>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
bool batched_bench_output_jsonl = false;
};
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();
+
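A minimal usage sketch, assuming a hypothetical example program that links libcommon: gpt_init() is called once at startup so that llama.cpp's internal logging is routed through the new gpt_log facility, after which the LOG_* macros from log.h can be used directly.

    #include "common.h"
    #include "log.h"

    int main() {
        gpt_init(); // installs the llama.cpp log callback and prints build info
        LOG_INF("common library initialized\n");
        return 0;
    }
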
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
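A hedged sketch of how these helpers combine with the new logging macros; debug_dump_tokens() is a hypothetical wrapper added only for illustration, and "log.h" is assumed to be included alongside "common.h":

    #include "common.h"
    #include "log.h"

    // hypothetical helper: dump a tokenized prompt at debug verbosity
    static void debug_dump_tokens(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
        LOG_DBG("prompt tokens: %s\n", string_from(ctx, tokens).c_str());
    }
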
//
// Filesystem utils
//
--- /dev/null
+#include "log.h"
+
+#include <condition_variable>
+#include <cstdarg>
+#include <cstdio>
+#include <mutex>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+
+void gpt_log_set_verbosity_thold(int verbosity) {
+ gpt_log_verbosity_thold = verbosity;
+}
+
+#define LOG_COL_DEFAULT "\033[0m"
+#define LOG_COL_BOLD "\033[1m"
+#define LOG_COL_RED "\033[31m"
+#define LOG_COL_GREEN "\033[32m"
+#define LOG_COL_YELLOW "\033[33m"
+#define LOG_COL_BLUE "\033[34m"
+#define LOG_COL_MAGENTA "\033[35m"
+#define LOG_COL_CYAN "\033[36m"
+#define LOG_COL_WHITE "\033[37m"
+
+static int64_t t_us() {
+ return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+// colors
+enum gpt_log_col : int {
+ GPT_LOG_COL_DEFAULT = 0,
+ GPT_LOG_COL_BOLD,
+ GPT_LOG_COL_RED,
+ GPT_LOG_COL_GREEN,
+ GPT_LOG_COL_YELLOW,
+ GPT_LOG_COL_BLUE,
+ GPT_LOG_COL_MAGENTA,
+ GPT_LOG_COL_CYAN,
+ GPT_LOG_COL_WHITE,
+};
+
+// disable colors by default
+static std::vector<const char *> g_col = {
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+ "",
+};
+
+struct gpt_log_entry {
+ enum ggml_log_level level;
+
+ bool prefix;
+
+ int64_t timestamp;
+
+ std::vector<char> msg;
+
+ // signals the worker thread to stop
+ bool is_end;
+
+ void print(FILE * file = nullptr) const {
+ FILE * fcur = file;
+ if (!fcur) {
+ // stderr displays DBG messages only when their verbosity level is not higher than the threshold
+ // these messages will still be logged to a file
+ if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+ return;
+ }
+
+ fcur = stdout;
+
+ if (level != GGML_LOG_LEVEL_NONE) {
+ fcur = stderr;
+ }
+ }
+
+ if (level != GGML_LOG_LEVEL_NONE && prefix) {
+ if (timestamp) {
+ // [M.s.ms.us]
+ fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
+ g_col[GPT_LOG_COL_BLUE],
+ (int) (timestamp / 1000000 / 60),
+ (int) (timestamp / 1000000 % 60),
+ (int) (timestamp / 1000 % 1000),
+ (int) (timestamp % 1000),
+ g_col[GPT_LOG_COL_DEFAULT]);
+ }
+
+ switch (level) {
+ case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
+ case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
+ case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
+ case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
+ default:
+ break;
+ }
+ }
+
+ fprintf(fcur, "%s", msg.data());
+
+ if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
+ fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+ }
+
+ fflush(fcur);
+ }
+};
+
+struct gpt_log {
+ // default capacity - will be expanded if needed
+ gpt_log() : gpt_log(256) {}
+
+ gpt_log(size_t capacity) {
+ file = nullptr;
+ prefix = false;
+ timestamps = false;
+ running = false;
+ t_start = t_us();
+
+ // initial message size - will be expanded if longer messages arrive
+ entries.resize(capacity);
+ for (auto & entry : entries) {
+ entry.msg.resize(256);
+ }
+
+ head = 0;
+ tail = 0;
+
+ resume();
+ }
+
+ ~gpt_log() {
+ pause();
+ if (file) {
+ fclose(file);
+ }
+ }
+
+private:
+ std::mutex mtx;
+ std::thread thrd;
+ std::condition_variable cv;
+
+ FILE * file;
+
+ bool prefix;
+ bool timestamps;
+ bool running;
+
+ int64_t t_start;
+
+ // ring buffer of entries
+ std::vector<gpt_log_entry> entries;
+ size_t head;
+ size_t tail;
+
+ // worker thread copies into this
+ gpt_log_entry cur;
+
+public:
+ void add(enum ggml_log_level level, const char * fmt, va_list args) {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (!running) {
+ // discard messages while the worker thread is paused
+ return;
+ }
+
+ auto & entry = entries[tail];
+
+ {
+ // cannot use args twice, so make a copy in case we need to expand the buffer
+ va_list args_copy;
+ va_copy(args_copy, args);
+
+#if 1
+ const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
+ if (n >= entry.msg.size()) {
+ entry.msg.resize(n + 1);
+ vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
+ }
+#else
+ // hack for bolding arguments
+
+ std::stringstream ss;
+ for (int i = 0; fmt[i] != 0; i++) {
+ if (fmt[i] == '%') {
+ ss << LOG_COL_BOLD;
+ while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
+ ss << LOG_COL_DEFAULT;
+ if (fmt[i] == 0) break;
+ }
+ ss << fmt[i];
+ }
+ const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
+ if (n >= entry.msg.size()) {
+ entry.msg.resize(n + 1);
+ vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
+ }
+#endif
+ }
+
+ entry.level = level;
+ entry.prefix = prefix;
+ entry.timestamp = 0;
+ if (timestamps) {
+ entry.timestamp = t_us() - t_start;
+ }
+ entry.is_end = false;
+
+ tail = (tail + 1) % entries.size();
+ if (tail == head) {
+ // expand the buffer
+ std::vector<gpt_log_entry> new_entries(2*entries.size());
+
+ size_t new_tail = 0;
+
+ do {
+ new_entries[new_tail] = std::move(entries[head]);
+
+ head = (head + 1) % entries.size();
+ new_tail = (new_tail + 1);
+ } while (head != tail);
+
+ head = 0;
+ tail = new_tail;
+
+ for (size_t i = tail; i < new_entries.size(); i++) {
+ new_entries[i].msg.resize(256);
+ }
+
+ entries = std::move(new_entries);
+ }
+
+ cv.notify_one();
+ }
+
+ void resume() {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (running) {
+ return;
+ }
+
+ running = true;
+
+ thrd = std::thread([this]() {
+ while (true) {
+ {
+ std::unique_lock<std::mutex> lock(mtx);
+ cv.wait(lock, [this]() { return head != tail; });
+
+ cur = entries[head];
+
+ head = (head + 1) % entries.size();
+ }
+
+ if (cur.is_end) {
+ break;
+ }
+
+ cur.print(); // stdout and stderr
+
+ if (file) {
+ cur.print(file);
+ }
+ }
+ });
+ }
+
+ void pause() {
+ {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ if (!running) {
+ return;
+ }
+
+ running = false;
+
+ // push an entry to signal the worker thread to stop
+ {
+ auto & entry = entries[tail];
+ entry.is_end = true;
+
+ tail = (tail + 1) % entries.size();
+ }
+
+ cv.notify_one();
+ }
+
+ thrd.join();
+ }
+
+ void set_file(const char * path) {
+ pause();
+
+ if (file) {
+ fclose(file);
+ }
+
+ if (path) {
+ file = fopen(path, "w");
+ } else {
+ file = nullptr;
+ }
+
+ resume();
+ }
+
+ void set_colors(bool colors) {
+ pause();
+
+ if (colors) {
+ g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+ g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
+ g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
+ g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
+ g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
+ g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
+ g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+ g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
+ g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
+ } else {
+ for (size_t i = 0; i < g_col.size(); i++) {
+ g_col[i] = "";
+ }
+ }
+
+ resume();
+ }
+
+ void set_prefix(bool prefix) {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ this->prefix = prefix;
+ }
+
+ void set_timestamps(bool timestamps) {
+ std::lock_guard<std::mutex> lock(mtx);
+
+ this->timestamps = timestamps;
+ }
+};
+
+//
+// public API
+//
+
+struct gpt_log * gpt_log_init() {
+ return new gpt_log;
+}
+
+struct gpt_log * gpt_log_main() {
+ static struct gpt_log log;
+
+ return &log;
+}
+
+void gpt_log_pause(struct gpt_log * log) {
+ log->pause();
+}
+
+void gpt_log_resume(struct gpt_log * log) {
+ log->resume();
+}
+
+void gpt_log_free(struct gpt_log * log) {
+ delete log;
+}
+
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ log->add(level, fmt, args);
+ va_end(args);
+}
+
+void gpt_log_set_file(struct gpt_log * log, const char * file) {
+ log->set_file(file);
+}
+
+void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+ log->set_colors(colors);
+}
+
+void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+ log->set_prefix(prefix);
+}
+
+void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+ log->set_timestamps(timestamps);
+}
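A short sketch of the standalone API above, under the assumption that a dedicated logger (separate from gpt_log_main()) is wanted; the file name and message are illustrative:

    #include "log.h"

    int main() {
        struct gpt_log * log = gpt_log_init();
        gpt_log_set_colors(log, false);
        gpt_log_set_file(log, "example.log"); // messages are also printed to the console
        gpt_log_add(log, GGML_LOG_LEVEL_INFO, "hello from a dedicated logger: %d\n", 42);
        gpt_log_free(log); // stops the worker thread and closes the file
        return 0;
    }
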
#pragma once
-#include <chrono>
-#include <cstring>
-#include <sstream>
-#include <iostream>
-#include <thread>
-#include <vector>
-#include <algorithm>
-#include <cinttypes>
+#include "ggml.h" // for ggml_log_level
-// --------------------------------
-//
-// Basic usage:
-//
-// --------
-//
-// The LOG() and LOG_TEE() macros are ready to go by default
-// they do not require any initialization.
-//
-// LOGLN() and LOG_TEELN() are variants which automatically
-// include \n character at the end of the log string.
-//
-// LOG() behaves exactly like printf, by default writing to a logfile.
-// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
-//
-// Default logfile is named
-// "llama.<threadID>.log"
-// Default LOG_TEE() secondary output target is
-// stderr
-//
-// Logs can be dynamically disabled or enabled using functions:
-// log_disable()
-// and
-// log_enable()
-//
-// A log target can be changed with:
-// log_set_target( string )
-// creating and opening, or re-opening a file by string filename
-// or
-// log_set_target( FILE* )
-// allowing to point at stderr, stdout, or any valid FILE* file handler.
-//
-// --------
-//
-// End of Basic usage.
-//
-// --------------------------------
-
-// Specifies a log target.
-// default uses log_handler() with "llama.log" log file
-// this can be changed, by defining LOG_TARGET
-// like so:
-//
-// #define LOG_TARGET (a valid FILE*)
-// #include "log.h"
-//
-// or it can be simply redirected to stdout or stderr
-// like so:
-//
-// #define LOG_TARGET stderr
-// #include "log.h"
-//
-// The log target can also be redirected to a different function
-// like so:
-//
-// #define LOG_TARGET log_handler_different()
-// #include "log.h"
-//
-// FILE* log_handler_different()
-// {
-// return stderr;
-// }
-//
-// or:
-//
-// #define LOG_TARGET log_handler_another_one("somelog.log")
-// #include "log.h"
-//
-// FILE* log_handler_another_one(char*filename)
-// {
-// static FILE* logfile = nullptr;
-// (...)
-// if( !logfile )
-// {
-// fopen(...)
-// }
-// (...)
-// return logfile
-// }
-//
-#ifndef LOG_TARGET
- #define LOG_TARGET log_handler()
-#endif
-
-#ifndef LOG_TEE_TARGET
- #define LOG_TEE_TARGET stderr
+#ifndef __GNUC__
+# define LOG_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
-// Utility for synchronizing log configuration state
-// since std::optional was introduced only in c++17
-enum LogTriState
-{
- LogTriStateSame,
- LogTriStateFalse,
- LogTriStateTrue
-};
-
-// Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
- static std::string pid;
- if (pid.empty())
- {
- // std::this_thread::get_id() is the most portable way of obtaining a "process id"
- // it's not the same as "pid" but is unique enough to solve multiple instances
- // trying to write to the same log.
- std::stringstream ss;
- ss << std::this_thread::get_id();
- pid = ss.str();
- }
-
- return pid;
-}
-
-// Utility function for generating log file names with unique id based on thread id.
-// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
-// where the number is a runtime id of the current thread.
-
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
-
-// INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
-{
- static bool _multilog = false;
-
- if (multilog != LogTriStateSame)
- {
- _multilog = multilog == LogTriStateTrue;
- }
+#define LOG_DEFAULT_DEBUG 1
+#define LOG_DEFAULT_LLAMA 0
- std::stringstream buf;
+// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity of the message is higher than the threshold
+// set via gpt_log_set_verbosity_thold()
+extern int gpt_log_verbosity_thold;
- buf << log_file_basename;
- if (_multilog)
- {
- buf << ".";
- buf << log_get_pid();
- }
- buf << ".";
- buf << log_file_extension;
+void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
- return buf.str();
-}
+// the gpt_log uses an internal worker thread to print/write log messages
+// when the worker thread is paused, incoming log messages are discarded
+struct gpt_log;
-#ifndef LOG_DEFAULT_FILE_NAME
- #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
-#endif
-
-// Utility for turning #define values into string literals
-// so we can have a define for stderr and
-// we can print "stderr" instead of literal stderr, etc.
-#define LOG_STRINGIZE1(s) #s
-#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
+struct gpt_log * gpt_log_init();
+struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
+void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
+void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
+void gpt_log_free (struct gpt_log * log);
-#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
+LOG_ATTRIBUTE_FORMAT(3, 4)
+void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
-// Allows disabling timestamps.
-// in order to disable, define LOG_NO_TIMESTAMPS
-// like so:
+// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
-// #define LOG_NO_TIMESTAMPS
-// #include "log.h"
+// regular log output:
//
-#ifndef LOG_NO_TIMESTAMPS
- #ifndef _MSC_VER
- #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #else
- #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #endif
-#else
- #define LOG_TIMESTAMP_FMT "%s"
- #define LOG_TIMESTAMP_VAL ,""
-#endif
-
-#ifdef LOG_TEE_TIMESTAMPS
- #ifndef _MSC_VER
- #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #else
- #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
- #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
- #endif
-#else
- #define LOG_TEE_TIMESTAMP_FMT "%s"
- #define LOG_TEE_TIMESTAMP_VAL ,""
-#endif
-
-// Allows disabling file/line/function prefix
-// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
-// like so:
+// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
+// llm_load_tensors: ggml ctx size = 0.27 MiB
+// llm_load_tensors: offloading 32 repeating layers to GPU
+// llm_load_tensors: offloading non-repeating layers to GPU
//
-// #define LOG_NO_FILE_LINE_FUNCTION
-// #include "log.h"
+// with prefix = true, timestamps = true, the log output will look like this:
//
-#ifndef LOG_NO_FILE_LINE_FUNCTION
- #ifndef _MSC_VER
- #define LOG_FLF_FMT "[%24s:%5d][%24s] "
- #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
- #else
- #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
- #endif
-#else
- #define LOG_FLF_FMT "%s"
- #define LOG_FLF_VAL ,""
-#endif
-
-#ifdef LOG_TEE_FILE_LINE_FUNCTION
- #ifndef _MSC_VER
- #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
- #else
- #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
- #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
- #endif
-#else
- #define LOG_TEE_FLF_FMT "%s"
- #define LOG_TEE_FLF_VAL ,""
-#endif
-
-// INTERNAL, DO NOT USE
-// USE LOG() INSTEAD
+// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
+// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
+// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
+// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
- #define LOG_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- } while (0)
-#else
- #define LOG_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- } while (0)
-#endif
-
-// INTERNAL, DO NOT USE
-// USE LOG_TEE() INSTEAD
+// I - info (stdout, V = 0)
+// W - warning (stderr, V = 0)
+// E - error (stderr, V = 0)
+// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
- #define LOG_TEE_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
- { \
- fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
- fflush(LOG_TEE_TARGET); \
- } \
- } while (0)
-#else
- #define LOG_TEE_IMPL(str, ...) \
- do { \
- if (LOG_TARGET != nullptr) \
- { \
- fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
- fflush(LOG_TARGET); \
- } \
- if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
- { \
- fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
- fflush(LOG_TEE_TARGET); \
- } \
- } while (0)
-#endif
-// The '\0' as a last argument, is a trick to bypass the silly
-// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
-// so we can have a single macro which can be called just like printf.
+void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
+void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
+void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
+void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
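The prefixed, timestamped output shown in the comment above can be reproduced on the default logger with the setters below; a brief sketch with an illustrative message:

    #include "log.h"

    int main() {
        gpt_log_set_prefix    (gpt_log_main(), true); // print the I/W/E/D level marker
        gpt_log_set_timestamps(gpt_log_main(), true); // print the elapsed time before each message
        LOG_INF("llm_load_tensors: offloading 32 repeating layers to GPU\n");
        return 0;
    }
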
-// Main LOG macro.
-// behaves like printf, and supports arguments the exact same way.
+// helper macros for logging
+// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
-#if !defined(_MSC_VER) || defined(__clang__)
- #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
-#else
- #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
-#endif
-
-// Main TEE macro.
-// does the same as LOG
-// and
-// simultaneously writes stderr.
+// for example:
//
-// Secondary target can be changed just like LOG_TARGET
-// by defining LOG_TEE_TARGET
+// LOG_DBG("this is a debug message: %d\n", expensive_function());
+//
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//
-#if !defined(_MSC_VER) || defined(__clang__)
- #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
-#else
- #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
-#endif
-
-// LOG macro variants with auto endline.
-#if !defined(_MSC_VER) || defined(__clang__)
- #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
- #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
-#else
- #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
- #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
-#endif
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
-{
- static bool _initialized = false;
- static bool _append = false;
- static bool _disabled = filename.empty() && target == nullptr;
- static std::string log_current_filename{filename};
- static FILE *log_current_target{target};
- static FILE *logfile = nullptr;
-
- if (change)
- {
- if (append != LogTriStateSame)
- {
- _append = append == LogTriStateTrue;
- return logfile;
- }
-
- if (disable == LogTriStateTrue)
- {
- // Disable primary target
- _disabled = true;
- }
- // If previously disabled, only enable, and keep previous target
- else if (disable == LogTriStateFalse)
- {
- _disabled = false;
- }
- // Otherwise, process the arguments
- else if (log_current_filename != filename || log_current_target != target)
- {
- _initialized = false;
- }
- }
-
- if (_disabled)
- {
- // Log is disabled
- return nullptr;
- }
-
- if (_initialized)
- {
- // with fallback in case something went wrong
- return logfile ? logfile : stderr;
- }
-
- // do the (re)initialization
- if (target != nullptr)
- {
- if (logfile != nullptr && logfile != stdout && logfile != stderr)
- {
- fclose(logfile);
- }
-
- log_current_filename = LOG_DEFAULT_FILE_NAME;
- log_current_target = target;
-
- logfile = target;
- }
- else
- {
- if (log_current_filename != filename)
- {
- if (logfile != nullptr && logfile != stdout && logfile != stderr)
- {
- fclose(logfile);
- }
- }
-
- logfile = fopen(filename.c_str(), _append ? "a" : "w");
- }
-
- if (!logfile)
- {
- // Verify whether the file was opened, otherwise fallback to stderr
- logfile = stderr;
-
- fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
- fflush(stderr);
-
- // At this point we let the init flag be to true below, and let the target fallback to stderr
- // otherwise we would repeatedly fopen() which was already unsuccessful
- }
-
- _initialized = true;
-
- return logfile ? logfile : stderr;
-}
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
-{
- return log_handler1_impl(change, append, disable, filename, target);
-}
-
-// Disables logs entirely at runtime.
-// Makes LOG() and LOG_TEE() produce no output,
-// until enabled back.
-#define log_disable() log_disable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_disable_impl()
-{
- return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
-}
-
-// Enables logs at runtime.
-#define log_enable() log_enable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_enable_impl()
-{
- return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
-}
-
-// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
-#define log_set_target(target) log_set_target_impl(target)
-
-// INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler() { return log_handler1_impl(); }
-
-// Enable or disable creating separate log files for each run.
-// can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-// can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
- return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
-inline void log_test()
-{
- log_disable();
- LOG("01 Hello World to nobody, because logs are disabled!\n");
- log_enable();
- LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
- LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
- log_set_target(stderr);
- LOG("04 Hello World to stderr!\n");
- LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
- log_set_target(LOG_DEFAULT_FILE_NAME);
- LOG("06 Hello World to default log file!\n");
- log_set_target(stdout);
- LOG("07 Hello World to stdout!\n");
- log_set_target(LOG_DEFAULT_FILE_NAME);
- LOG("08 Hello World to default log file again!\n");
- log_disable();
- LOG("09 Hello World _1_ into the void!\n");
- log_enable();
- LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
- log_disable();
- log_set_target("llama.anotherlog.log");
- LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
- log_enable();
- LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
- log_set_target("llama.yetanotherlog.log");
- LOG("13 Hello World this time in yet new file?\n");
- log_set_target(log_filename_generator("llama_autonamed", "log"));
- LOG("14 Hello World in log with generated filename!\n");
-#ifdef _MSC_VER
- LOG_TEE("15 Hello msvc TEE without arguments\n");
- LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
- LOG_TEELN("17 Hello msvc TEELN without arguments\n");
- LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
- LOG("19 Hello msvc LOG without arguments\n");
- LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
- LOGLN("21 Hello msvc LOGLN without arguments\n");
- LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
-#endif
-}
-
-inline bool log_param_single_parse(const std::string & param)
-{
- if ( param == "--log-test")
- {
- log_test();
- return true;
- }
-
- if ( param == "--log-disable")
- {
- log_disable();
- return true;
- }
-
- if ( param == "--log-enable")
- {
- log_enable();
- return true;
- }
-
- if (param == "--log-new")
- {
- log_multilog(true);
- return true;
- }
-
- if (param == "--log-append")
- {
- log_append(true);
- return true;
- }
-
- return false;
-}
-
-inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
-{
- if ( param == "--log-file")
- {
- if (!check_but_dont_parse)
- {
- log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
- }
-
- return true;
- }
-
- return false;
-}
-
-inline void log_print_usage()
-{
- printf("log options:\n");
- /* format
- printf(" -h, --help show this help message and exit\n");*/
- /* spacing
- printf("__-param----------------Description\n");*/
- printf(" --log-test Run simple logging test\n");
- printf(" --log-disable Disable trace logs\n");
- printf(" --log-enable Enable trace logs\n");
- printf(" --log-file Specify a log filename (without extension)\n");
- printf(" --log-new Create a separate new log file on start. "
- "Each log file will have unique name: \"<name>.<ID>.log\"\n");
- printf(" --log-append Don't truncate the old log file.\n");
- printf("\n");
-}
-
-#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
-
-// INTERNAL, DO NOT USE
-inline void log_dump_cmdline_impl(int argc, char **argv)
-{
- std::stringstream buf;
- for (int i = 0; i < argc; ++i)
- {
- if (std::string(argv[i]).find(' ') != std::string::npos)
- {
- buf << " \"" << argv[i] <<"\"";
- }
- else
- {
- buf << " " << argv[i];
- }
- }
- LOGLN("Cmd:%s", buf.str().c_str());
-}
-
-#define log_tostr(var) log_var_to_string_impl(var).c_str()
-
-inline std::string log_var_to_string_impl(bool var)
-{
- return var ? "true" : "false";
-}
-
-inline std::string log_var_to_string_impl(std::string var)
-{
- return var;
-}
-
-inline std::string log_var_to_string_impl(const std::vector<int> & var)
-{
- std::stringstream buf;
- buf << "[ ";
- bool first = true;
- for (auto e : var)
- {
- if (first)
- {
- first = false;
- }
- else
- {
- buf << ", ";
- }
- buf << std::to_string(e);
- }
- buf << " ]";
-
- return buf.str();
-}
-
-template <typename C, typename T>
-inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
-{
- std::stringstream buf;
- buf << "[ ";
-
- bool first = true;
- for (const auto & token : tokens)
- {
- if (!first) {
- buf << ", ";
- } else {
- first = false;
- }
-
- auto detokenized = llama_token_to_piece(ctx, token);
-
- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
- buf
- << "'" << detokenized << "'"
- << ":" << std::to_string(token);
- }
- buf << " ]";
-
- return buf.str();
-}
-
-template <typename C, typename B>
-inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
-{
- std::stringstream buf;
- buf << "[ ";
-
- bool first = true;
- for (int i = 0; i < batch.n_tokens; ++i)
- {
- if (!first) {
- buf << ", ";
- } else {
- first = false;
- }
-
- auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
-
- detokenized.erase(
- std::remove_if(
- detokenized.begin(),
- detokenized.end(),
- [](const unsigned char c) { return !std::isprint(c); }),
- detokenized.end());
-
- buf
- << "\n" << std::to_string(i)
- << ":token '" << detokenized << "'"
- << ":pos " << std::to_string(batch.pos[i])
- << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
- << ":seq_id " << std::to_string(batch.seq_id[i][0])
- << ":logits " << std::to_string(batch.logits[i]);
- }
- buf << " ]";
-
- return buf.str();
-}
-
-#ifdef LOG_DISABLE_LOGS
-
-#undef LOG
-#define LOG(...) // dummy stub
-#undef LOGLN
-#define LOGLN(...) // dummy stub
-
-#undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_DISABLE
-#define LOG_DISABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
+#define LOG_TMPL(level, verbosity, ...) \
+ do { \
+ if ((verbosity) <= gpt_log_verbosity_thold) { \
+ gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+ } \
+ } while (0)
-#undef LOG_SET_TARGET
-#define LOG_SET_TARGET(...) // dummy stub
+#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
+#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
-#undef LOG_DUMP_CMDLINE
-#define LOG_DUMP_CMDLINE(...) // dummy stub
+#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
+#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
+#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
+#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
-#endif // LOG_DISABLE_LOGS
+#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
+#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
+#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
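Taken together, these macros all funnel through LOG_TMPL, which drops a message when its verbosity argument exceeds gpt_log_verbosity_thold and otherwise forwards it to the logger returned by gpt_log_main(). A minimal usage sketch under that assumption (the function and variable names below are placeholders, not taken from this diff):

#include "log.h"

static void log_usage_sketch(int n_ctx) {
    LOG_INF("%s: n_ctx = %d\n", __func__, n_ctx);   // INFO: shown at the default threshold
    LOG_WRN("%s: n_ctx is small\n", __func__);      // WARN: same sink, colored when colors are enabled
    LOG_DBG("%s: extra detail\n", __func__);        // DEBUG: gated behind LOG_DEFAULT_DEBUG
    LOGV(2, "only shown when verbosity >= 2\n");    // unleveled text with an explicit verbosity gate
    LOG("raw output with no level prefix\n");       // replaces printf/LOG_TEE-style raw printing
}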
#include "common.h"
#include "log.h"
+#include <cinttypes>
#include <cstdint>
+#include <cstdio>
#include <fstream>
+#include <thread>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
}
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
- std::string result = "\tlogits ";
+ std::string result = "logits ";
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
#include "train.h"
#include "common.h"
+#include <algorithm>
#include <random>
#include <sstream>
#include <functional>
+#include <cstring>
struct random_normal_distribution {
std::mt19937 gen;
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <algorithm>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = params.n_pp;
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
if (!params.batched_bench_output_jsonl) {
- LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
- LOG_TEE("\n");
- LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
- LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+ LOG("\n");
+ LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG("\n");
+ LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+ LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
}
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
}
const float speed = n_kv / t;
if(params.batched_bench_output_jsonl) {
- LOG_TEE(
+ LOG(
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
);
} else {
- LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+ LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
}
}
}
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
llama_batch_free(batch);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
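The per-example changes that follow repeat one pattern: call gpt_init() once after argument parsing, then route errors through LOG_ERR/LOG_WRN, status through LOG_INF, and raw output through LOG. A condensed sketch of that pattern, assuming the common helpers used elsewhere in this diff (gpt_params_parse, llama_model_params_from_gpt_params); the messages are illustrative:

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    gpt_init(); // install the common logger before the first LOG_* call

    llama_backend_init();

    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__); // previously fprintf(stderr, ...)
        llama_backend_free();
        return 1;
    }

    LOG_INF("%s: model loaded\n", __func__); // previously LOG_TEE(...)

    llama_free_model(model);
    llama_backend_free();

    return 0;
}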
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <algorithm>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
// number of parallel batches
int n_parallel = params.n_parallel;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
- fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: error: unable to load model\n" , __func__);
return 1;
}
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
if (ctx == NULL) {
- fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
- LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
- LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
- LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+ LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+ LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
- fprintf(stderr, "\n");
+ LOG("\n");
for (auto id : tokens_list) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
- fflush(stderr);
-
// create a llama_batch
// we use this object to submit token data for decoding
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
//}
if (n_parallel > 1) {
- LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+ LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
}
// main loop
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
- LOG_TEE("\n");
+ LOG("\n");
if (n_parallel > 1) {
- LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+ LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
}
continue;
// if there is only one stream, we print immediately to stdout
if (n_parallel == 1) {
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
}
streams[i] += llama_token_to_piece(ctx, new_token_id);
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
- LOG_TEE("\n");
-
if (n_parallel > 1) {
- LOG_TEE("\n");
+ LOG("\n");
for (int32_t i = 0; i < n_parallel; ++i) {
- LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+ LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
}
}
const auto t_main_end = ggml_time_us();
- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
#include <climits>
#include <cstring>
#include <cstdarg>
+#include <cinttypes>
#include <ctime>
#include <random>
#include <stdexcept>
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo.resize(p->n_layers * p->dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
- LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight.resize(p->dim);
- LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+ LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = {};
} else {
w->wcls.resize(p->vocab_size * p->dim);
- LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+ LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
- LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
+ LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
}
static void print_sample_weights(TransformerWeights *w){
- LOG("----- Quick print of first of the weight vales of all the variables\n");
- LOG("%f\n", w->token_embedding_table[0]);
- LOG("%f\n", w->rms_att_weight[0]);
- LOG("%f\n", w->rms_ffn_weight[0]);
-
- LOG("%f\n", w->wq[0]);
- LOG("%f\n", w->wk[0]);
- LOG("%f\n", w->wv[0]);
- LOG("%f\n", w->wo[0]);
- LOG("%f\n", w->w1[0]);
- LOG("%f\n", w->w2[0]);
- LOG("%f\n", w->w3[0]);
- LOG("%f\n", w->rms_att_weight[0]);
- if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+ LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
+ LOG_INF("%f\n", w->token_embedding_table[0]);
+ LOG_INF("%f\n", w->rms_att_weight[0]);
+ LOG_INF("%f\n", w->rms_ffn_weight[0]);
+
+ LOG_INF("%f\n", w->wq[0]);
+ LOG_INF("%f\n", w->wk[0]);
+ LOG_INF("%f\n", w->wv[0]);
+ LOG_INF("%f\n", w->wo[0]);
+ LOG_INF("%f\n", w->w1[0]);
+ LOG_INF("%f\n", w->w2[0]);
+ LOG_INF("%f\n", w->w3[0]);
+ LOG_INF("%f\n", w->rms_att_weight[0]);
+ if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
};
static void print_params(struct my_llama_hparams * params) {
- LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
- LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
- LOG("%s: n_embd: %u\n", __func__, params->n_embd);
- LOG("%s: n_mult: %u\n", __func__, params->n_mult);
- LOG("%s: n_head: %u\n", __func__, params->n_head);
- LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
- LOG("%s: n_ff: %u\n", __func__, params->n_ff);
- LOG("%s: n_layer: %u\n", __func__, params->n_layer);
- LOG("%s: n_rot: %u\n", __func__, params->n_rot);
+ LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
+ LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
+ LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
+ LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
+ LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
+ LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+ LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
+ LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
+ LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- LOG("%s: Allocating ", __func__);
+ LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) {
- LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+ LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
- LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+ LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
}
int main(int argc, char ** argv) {
+ gpt_init();
+
struct train_params params = get_default_train_params();
    if (!params_parse(argc, argv, &params)) {
return 1;
}
- log_set_target(stdout);
+
Config config;
TransformerWeights weights = {};
{
- LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+ LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE * file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
- LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+ LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if (fread(&config, sizeof(Config), 1, file) != 1) {
- LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+ LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
// read in the Transformer weights
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
- LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+ LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
- LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+ LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;
#include "ggml-metal.h"
#endif
+#include <algorithm>
+#include <climits>
#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
#include <string>
#include <tuple>
#include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <climits>
//////////////////////////////////////////////////
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <ctime>
llama_kv_cache_clear(ctx);
// run model
- fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+ LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
// encoder-only model
if (llama_encode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to encode\n", __func__);
+ LOG_ERR("%s : failed to encode\n", __func__);
}
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
// decoder-only model
if (llama_decode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to decode\n", __func__);
+ LOG_ERR("%s : failed to decode\n", __func__);
}
}
return 1;
}
+ gpt_init();
+
params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
- print_build_info();
-
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
- fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
+ LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// split the prompt into lines
for (const auto & prompt : prompts) {
auto inp = ::llama_tokenize(ctx, prompt, true, false);
if (inp.size() > n_batch) {
- fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+ LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) {
- fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
- fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+ LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+ LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
}
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) inputs.size(); i++) {
- fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
- fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+ LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) {
- fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+ LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
}
}
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
if (params.embd_out.empty()) {
- fprintf(stdout, "\n");
+ LOG("\n");
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
for (int j = 0; j < n_embd_count; j++) {
- fprintf(stdout, "embedding %d: ", j);
+ LOG("embedding %d: ", j);
for (int i = 0; i < std::min(3, n_embd); i++) {
if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ LOG("%6.0f ", emb[j * n_embd + i]);
} else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ LOG("%9.6f ", emb[j * n_embd + i]);
}
}
- fprintf(stdout, " ... ");
+ LOG(" ... ");
for (int i = n_embd - 3; i < n_embd; i++) {
if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ LOG("%6.0f ", emb[j * n_embd + i]);
} else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ LOG("%9.6f ", emb[j * n_embd + i]);
}
}
- fprintf(stdout, "\n");
+ LOG("\n");
}
} else {
// print the first part of the embeddings or for a single prompt, the full embedding
for (int j = 0; j < n_prompts; j++) {
- fprintf(stdout, "embedding %d: ", j);
+ LOG("embedding %d: ", j);
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
if (params.embd_normalize == 0) {
- fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+ LOG("%6.0f ", emb[j * n_embd + i]);
} else {
- fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+ LOG("%9.6f ", emb[j * n_embd + i]);
}
}
- fprintf(stdout, "\n");
+ LOG("\n");
}
// print cosine similarity matrix
if (n_prompts > 1) {
- fprintf(stdout, "\n");
- printf("cosine similarity matrix:\n\n");
+ LOG("\n");
+ LOG("cosine similarity matrix:\n\n");
for (int i = 0; i < n_prompts; i++) {
- fprintf(stdout, "%6.6s ", prompts[i].c_str());
+ LOG("%6.6s ", prompts[i].c_str());
}
- fprintf(stdout, "\n");
+ LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
- fprintf(stdout, "%6.2f ", sim);
+ LOG("%6.2f ", sim);
}
- fprintf(stdout, "%1.10s", prompts[i].c_str());
- fprintf(stdout, "\n");
+ LOG("%1.10s", prompts[i].c_str());
+ LOG("\n");
}
}
}
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
const bool notArray = params.embd_out != "array";
- fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+ LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
for (int j = 0;;) { // at least one iteration (one prompt)
- if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
- fprintf(stdout, "[");
+ if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+ LOG("[");
for (int i = 0;;) { // at least one iteration (n_embd > 0)
- fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+ LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
i++;
- if (i < n_embd) fprintf(stdout, ","); else break;
+ if (i < n_embd) LOG(","); else break;
}
- fprintf(stdout, notArray ? "]\n }" : "]");
+ LOG(notArray ? "]\n }" : "]");
j++;
- if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
+ if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
}
- fprintf(stdout, notArray ? "\n ]" : "]\n");
+ LOG(notArray ? "\n ]" : "]\n");
if (params.embd_out == "json+" && n_prompts > 1) {
- fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+ LOG(",\n \"cosineSimilarity\": [\n");
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
- fprintf(stdout, " [");
+ LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
- fprintf(stdout, "%6.2f", sim);
+ LOG("%6.2f", sim);
j++;
- if (j < n_embd_count) fprintf(stdout, ", "); else break;
+ if (j < n_embd_count) LOG(", "); else break;
}
- fprintf(stdout, " ]");
+ LOG(" ]");
i++;
- if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
+ if (i < n_embd_count) LOG(",\n"); else break;
}
- fprintf(stdout, "\n ]");
+ LOG("\n ]");
}
- if (notArray) fprintf(stdout, "\n}\n");
+ if (notArray) LOG("\n}\n");
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
// clean up
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
-#include <random>
#include <string>
-#include <tuple>
#include <vector>
/**
GGML_ASSERT(n > 0);
float sum = 0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
- printf(" [\n");
+ LOG(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
- printf(" ..., \n");
+ LOG(" ..., \n");
i2 = ne[2] - n;
}
- printf(" [\n");
+ LOG(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
if (i1 == n && ne[1] > 2*n) {
- printf(" ..., \n");
+ LOG(" ..., \n");
i1 = ne[1] - n;
}
- printf(" [");
+ LOG(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
if (i0 == n && ne[0] > 2*n) {
- printf("..., ");
+ LOG("..., ");
i0 = ne[0] - n;
}
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
} else {
GGML_ABORT("fatal error");
}
- printf("%12.4f", v);
+ LOG("%12.4f", v);
sum += v;
- if (i0 < ne[0] - 1) printf(", ");
+ if (i0 < ne[0] - 1) LOG(", ");
}
- printf("],\n");
+ LOG("],\n");
}
- printf(" ],\n");
+ LOG(" ],\n");
}
- printf(" ]\n");
- printf(" sum = %f\n", sum);
+ LOG(" ]\n");
+ LOG(" sum = %f\n", sum);
}
}
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
- printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
- t->name, ggml_type_name(t->type), ggml_op_desc(t),
- src0->name, ggml_ne_string(src0).c_str(),
- src1 ? src1_str : "",
- ggml_ne_string(t).c_str());
+ LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+ t->name, ggml_type_name(t->type), ggml_op_desc(t),
+ src0->name, ggml_ne_string(src0).c_str(),
+ src1 ? src1_str : "",
+ ggml_ne_string(t).c_str());
// copy the data from the GPU memory if needed
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
return 1;
}
- print_build_info();
+ gpt_init();
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
- fprintf(stderr, "%s : failed to init\n", __func__);
+ LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
}
bool OK = run(ctx, params);
return 1;
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
return 1;
}
- g_verbose = (params.verbosity == 1);
+ g_verbose = (params.verbosity > 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
return 1;
}
+ gpt_init();
+
llama_model_params mparams = llama_model_params_from_gpt_params(params);
llama_context_params cparams = llama_context_params_from_gpt_params(params);
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <cmath>
#endif
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s \\\n"
- " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+ LOG("\nexample usage:\n");
+ LOG("\n %s \\\n"
+ " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
- LOG_TEE("\n");
+ LOG("\n");
}
struct Stats {
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
- fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+ LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ABORT("fatal error");
}
- if (m_params.verbosity > 1) {
- printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
- }
+ LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
// loop over all possible experts, regardless if they are used or not in the batch
for (int ex = 0; ex < n_as; ++ex) {
size_t e_start = ex*src1->ne[0];
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
- fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+ LOG("\n");
+ LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
- fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+ LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ABORT("fatal error");
}
++e.ncall;
- if (m_params.verbosity > 1) {
- printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
- }
+ LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
- fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+ LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
}
if (n_zeros != 0 && is_first) {
- fprintf(stderr, "\n");
+ LOG_INF("\n");
is_first = false;
}
if (n_zeros == n_all) {
- fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+ LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
- fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+ LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
}
if (to_store.size() < m_stats.size()) {
- fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+ LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
std::ofstream out(fname, std::ios::binary);
out.write(m_params.prompt_file.c_str(), len);
}
- if (m_params.verbosity > 0) {
- fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
- }
+ LOGV(1, "\n");
+ LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
- printf("%s: failed to open %s\n",__func__, fname);
+ LOG_ERR("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
- printf("%s: no data in file %s\n", __func__, fname);
+ LOG_ERR("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
- printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+ LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
- printf("%s: failed reading number of values for entry %d\n",__func__,i);
+ LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
m_stats = {};
return false;
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
- printf("%s: failed reading data for entry %d\n",__func__,i);
+ LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
m_stats = {};
return false;
}
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+ LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
- fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+ LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
- fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+ LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
- n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
return false;
}
double nll = 0.0;
double nll2 = 0.0;
- fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+ LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
// TODO: use batch.logits to save computations instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
if (params.compute_ppl) {
const int first = n_ctx/2;
- const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+ const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
logits.clear();
}
}
- printf("\n");
+ LOG("\n");
if (params.compute_ppl) {
nll2 /= count;
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
- printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+ LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
- printf("Unexpected negative standard deviation of log(prob)\n");
+ LOG("Unexpected negative standard deviation of log(prob)\n");
}
}
params.n_ctx = 512;
params.logits_all = true;
- params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}
+ gpt_init();
+
params.n_batch = std::min(params.n_batch, params.n_ctx);
g_collector.set_params(params);
for (const auto & in_file : params.in_files) {
- printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+ LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
- fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
+ LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
}
if (params.in_files.size() > 1) {
- printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+ LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
}
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
- fprintf(stderr, "%s : failed to init\n", __func__);
+ LOG_ERR("%s : failed to init\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
if (!compute_imatrix(ctx, params)) {
g_collector.save_imatrix();
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
llama_free(ctx);
#include "common.h"
#include "console.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cassert>
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
is_interacting = true;
} else {
console::cleanup();
- printf("\n");
+ LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
return 1;
}
- auto & sparams = params.sparams;
+ gpt_init();
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("infill", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+ auto & sparams = params.sparams;
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
+
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
- printf("\n************\n");
- printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+ LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
- print_build_info();
-
- LOG("%s: llama backend init\n", __func__);
+ LOG_INF("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
g_smpl = &smpl;
// load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);
+ LOG_DBG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print system information
{
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
- LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
embd_inp.push_back(middle_token);
}
- LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
- LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("add_bos: %d\n", add_bos);
+ LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+ LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
}
if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
params.n_keep = (int)embd_inp.size();
}
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+ LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+ LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
}
if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("\n");
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > 0) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- LOG_TEE("'\n");
+ LOG("'\n");
}
- LOG_TEE("\n");
+ LOG_INF("\n");
}
if (params.interactive) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG_INF("%s: interactive mode on.\n", __func__);
if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG_INF("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
}
if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
smpl = gpt_sampler_init(model, sparams);
- LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
- LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
- LOG_TEE("\n\n");
+ LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
- LOG_TEE("\n##### Infill mode #####\n\n");
+ LOG("\n");
+ LOG("\n##### Infill mode #####\n\n");
if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG( " - Press Ctrl+C to interject at any time.\n");
#endif
- LOG_TEE( "%s\n", control_message);
+ LOG( "%s\n", control_message);
is_interacting = params.interactive_first;
}
embd.resize(max_embd_size);
console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
- fflush(stdout);
}
// infinite text generation via context swapping
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
n_past -= n_discard;
- LOG("after swap: n_past = %d\n", n_past);
+ LOG_DBG("after swap: n_past = %d\n", n_past);
- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
}
n_eval = params.n_batch;
}
- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
}
}
gpt_sampler_accept(smpl, id, true);
- // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
// decrement remaining sampling budget
--n_remain;
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
if (embd.size() > 1) {
input_tokens.push_back(id);
output_ss << token_str;
}
}
- fflush(stdout);
}
// reset color to default if we there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
// print an eot token
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+ LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
- fflush(stdout);
- printf("\n");
+ LOG("\n");
console::set_display(console::user_input);
std::string buffer;
std::string line;
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
- // LOG_TEE("took new input\n");
is_interacting = false;
}
// deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
- LOG("found EOS token\n");
+ LOG_DBG("found EOS token\n");
if (params.interactive) {
is_interacting = true;
- printf("\n");
+ LOG("\n");
console::set_display(console::user_input);
- fflush(stdout);
}
}
if (n_past > 0 && is_interacting && !params.interactive) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");
if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty()) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
buffer += params.input_prefix;
- printf("%s", buffer.c_str());
+ LOG("%s", buffer.c_str());
}
std::string line;
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
- printf("%s", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
}
- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
}
n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
}
}
if (!params.interactive && n_remain <= 0) {
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
- LOG_TEE("\n");
+ LOG("\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
gpt_sampler_free(smpl);
llama_backend_free();
-#ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
return 0;
}
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
-#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cinttypes>
#include <limits>
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
static int get_key_idx(const gguf_context * ctx, const char * key) {
int i = gguf_find_key(ctx, key);
if (i == -1) {
- LOG_TEE("key %s not found in file\n", key);
+ LOG_ERR("key %s not found in file\n", key);
throw std::runtime_error(format("Missing required key: %s", key));
}
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
- LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+ LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
if (!file.is_open()) {
- LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+ LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
return;
}
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return nullptr;
}
if (load_image_size == nullptr) {
load_image_size = clip_image_size_init();
}
- LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+ LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
image_size_width = load_image_size->width;
image_size_height = load_image_size->height;
if (is_inf) {
const int idx_name = gguf_find_key(ctx, KEY_NAME);
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models are missing it due to a bug
const std::string name = gguf_get_val_str(ctx, idx_name);
- LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
+ LOG_INF("%s: model name: %s\n", __func__, name.c_str());
}
- LOG_TEE("%s: description: %s\n", __func__, description.c_str());
- LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
- LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
- LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
- LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
- LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
- LOG_TEE("\n");
+ LOG_INF("%s: description: %s\n", __func__, description.c_str());
+ LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+ LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+ LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
+ LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
+ LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
+ LOG_INF("\n");
}
const int n_tensors = gguf_get_n_tensors(ctx);
// kv
const int n_kv = gguf_get_n_kv(ctx);
- LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+ LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
__func__, n_kv, n_tensors, fname);
{
std::map<enum ggml_type, uint32_t> n_type;
n_type[type]++;
}
- LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+ LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
for (int i = 0; i < n_kv; i++) {
const char * name = gguf_get_key(ctx, i);
const enum gguf_type type = gguf_get_kv_type(ctx, i);
}
replace_all(value, "\n", "\\n");
- LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+ LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
}
// print type counts
continue;
}
- LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+ LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
}
}
size_t tensor_size = ggml_nbytes(cur);
model_size += tensor_size;
if (verbosity >= 3) {
- LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+ LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
}
}
#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
- LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+ LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif
#ifdef GGML_USE_METAL
new_clip->backend = ggml_backend_metal_init();
- LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+ LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
- LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+ LOG_INF("%s: CLIP using CANN backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
- LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+ LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
- LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+ LOG_INF("%s: CLIP using CPU backend\n", __func__);
}
// model size and capabilities
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
if (verbosity >= 1) {
- LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
- LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
- LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
- LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
- LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
- LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+ LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
+ LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+ LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
+ LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
+ LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+ LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
}
}
- LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+ LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
// load tensors
{
new_clip->ctx_data = ggml_init(params);
if (!new_clip->ctx_data) {
- LOG_TEE("%s: ggml_init() failed\n", __func__);
+ LOG_ERR("%s: ggml_init() failed\n", __func__);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
- LOG_TEE("cannot open model file for loading tensors\n");
+ LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
fin.seekg(offset, std::ios::beg);
if (!fin) {
- LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+ LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
clip_free(new_clip);
gguf_free(ctx);
return nullptr;
}
if (verbosity >= 2) {
- LOG_TEE("\n%s: vision model hparams\n", __func__);
- LOG_TEE("image_size %d\n", hparams.image_size);
- LOG_TEE("patch_size %d\n", hparams.patch_size);
- LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
- LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
- LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
- LOG_TEE("v_n_head %d\n", hparams.n_head);
- LOG_TEE("v_n_layer %d\n", hparams.n_layer);
- LOG_TEE("v_eps %f\n", hparams.eps);
- LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
- LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
- LOG_TEE("v_image_grid_pinpoints: ");
+ LOG_INF("\n%s: vision model hparams\n", __func__);
+ LOG_INF("image_size %d\n", hparams.image_size);
+ LOG_INF("patch_size %d\n", hparams.patch_size);
+ LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
+ LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
+ LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
+ LOG_INF("v_n_head %d\n", hparams.n_head);
+ LOG_INF("v_n_layer %d\n", hparams.n_layer);
+ LOG_INF("v_eps %f\n", hparams.eps);
+ LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+ LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+ LOG_INF("v_image_grid_pinpoints: ");
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
- LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+ LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
}
- LOG_TEE("\n");
- LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+ LOG_INF("\n");
+ LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
}
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& /*e*/) {
- LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+ LOG_ERR("%s: failed to load vision model tensors\n", __func__);
}
// LLaVA projection
} catch (std::runtime_error & /*e*/) { }
try {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
- // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
+ // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
- LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+ LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
}
return new_clip;
int nx, ny, nc;
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
if (!data) {
- LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
+ LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
int nx, ny, nc;
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
if (!data) {
- LOG_TEE("%s: failed to decode image bytes\n", __func__);
+ LOG_ERR("%s: failed to decode image bytes\n", __func__);
return false;
}
build_clip_img_from_data(data, nx, ny, img);
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
- // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+ // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images;
- LOG_TEE("%s: multiple %d\n", __func__, multiple);
+ LOG_INF("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>());
if (multiple <= 1) {
clip_image_u8 * source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
- LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+ LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
- LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+ LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 * refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
- LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+ LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
int idx = 0;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
- LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+ LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
clip_image_f32 * res = clip_image_f32_init();
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
res_imgs->data[idx++] = *res;
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
auto & params = ctx->vision_model.hparams;
}
for (size_t i = 0; i < patches.size(); i++) {
- // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+ // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
clip_image_u8_free(patches[i]);
}
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
if (!ctx->has_vision_encoder) {
- LOG_TEE("This gguf file seems to have no vision encoder\n");
+ LOG_ERR("This gguf file seems to have no vision encoder\n");
return false;
}
new_type = type;
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
- // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+ // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
}
const size_t n_elms = ggml_nelements(cur);
float * f32_data;
f32_data = (float *)conv_buf.data();
break;
default:
- LOG_TEE("Please use an input file in f32 or f16\n");
+ LOG_ERR("Please use an input file in f32 or f16\n");
gguf_free(ctx_out);
return false;
}
fout.put(0);
}
- LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+ LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
}
gguf_free(ctx_out);
{
- LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
- LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+ LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+ LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
}
return true;
#include <cstdio>
#include <cstdlib>
+#include <cstring>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
- LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+ LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
- LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+ LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
- LOG_TEE("%s: could not load image from base64 string.\n", __func__);
+ LOG_ERR("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
};
static void print_usage(int, char ** argv) {
- LOG_TEE("\n example usage:\n");
- LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+ LOG("\n example usage:\n");
+ LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
- LOG_TEE("using base64 encoded image instead of command line image path\n");
+ LOG_INF("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
- LOG_TEE("%s: can't load image from prompt\n", __func__);
+ LOG_ERR("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
- LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+ LOG_INF("system_prompt: %s\n", system_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
- LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+ LOG_INF("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
} else {
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
}
}
}
// generate the response
- LOG_TEE("\n");
+ LOG("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
if (!smpl) {
- fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
- printf("%s", tmp);
+ LOG("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
}
gpt_sampler_free(smpl);
- printf("\n");
+ LOG("\n");
}
static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
- LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
- auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+ auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
llama_backend_free();
}
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
-}
-
int main(int argc, char ** argv) {
ggml_time_init();
return 1;
}
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("llava", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+ gpt_init();
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv);
return 1;
}
- auto model = llava_init(&params);
+
+ auto * model = llava_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt)) {
- auto ctx_llava = llava_init_context(&params, model);
+ auto * ctx_llava = llava_init_context(&params, model);
- auto image_embed = load_image(ctx_llava, &params, "");
+ auto * image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llava_free(ctx_llava);
} else {
for (auto & image : params.image) {
- auto ctx_llava = llava_init_context(&params, model);
+ auto * ctx_llava = llava_init_context(&params, model);
- auto image_embed = load_image(ctx_llava, &params, image);
+ auto * image_embed = load_image(ctx_llava, &params, image);
if (!image_embed) {
- std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+ LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
#include "clip.h"
-#include "common.h"
-#include "llama.h"
#include "llava.h"
-#include "base64.hpp"
+#include "llama.h"
+
+#include <algorithm>
+#include <cerrno>
#include <cstdio>
#include <cstdlib>
+#include <cstring>
+#include <limits>
#include <vector>
-#include <numeric>
+
+#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
+#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
+
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image
struct clip_image_u8 {
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
- // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+ // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
- LOG_TEE("%s: unable to preprocess image\n", __func__);
+ LOG_ERR("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
- LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+ LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
- LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+ LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
- LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+ LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
- LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+ LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
- LOG_TEE("Unable to encode image\n");
+ LOG_ERR("Unable to encode image\n");
return false;
}
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
- LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+ LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
- LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+ LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
const int32_t * image_grid = clip_image_grid(ctx_clip);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
- LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+ LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
- LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+ LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
- LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+ LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
- LOG_TEE("Unable to allocate memory for image embeddings\n");
+ LOG_ERR("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
- LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+ LOG_ERR("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
- LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+ LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
- LOG_TEE("%s: coulnd't embed the image\n", __func__);
+ LOG_ERR("%s: coulnd't embed the image\n", __func__);
return NULL;
}
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
- LOG_TEE("%s: can't read file %s\n", __func__, path);
+ LOG_ERR("%s: can't read file %s\n", __func__, path);
return false;
}
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
- LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+ LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
- LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+ LOG_ERR("%s: failed to load %s\n", __func__, image_path);
return NULL;
}
#include "llama.h"
#include "ggml.h"
+#include <algorithm>
#include <cstdio>
#include <cstdlib>
+#include <cstring>
#include <vector>
+#include <iostream> // TODO: remove me
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
- LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
+ LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llama_model * llava_init(gpt_params * params) {
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
- LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+ LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
- LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
- auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+ auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
- auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+ auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
- LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+ LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
- auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+ auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
- LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+ LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
- LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+ LOG_INF("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct gpt_sampler * smpl,
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
- auto ctx_clip = clip_init_context(params);
- auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+ auto * ctx_clip = clip_init_context(params);
+ auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
- std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+ LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
return NULL;
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
- LOG_TEE("prompt should be given or interactive mode should be on");
+ LOG_ERR("prompt should be given or interactive mode should be on");
return NULL;
}
- auto model = llava_init(params);
+ auto * model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
- auto ctx_llava = llava_init_context(params, model);
+ auto * ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
- LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+ LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
- LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+ LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
}
-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
// generate the response
- LOG_TEE("\n");
+ LOG_INF("\n");
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
return smpl;
return 1;
}
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("llava", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+ gpt_init();
if (params.mmproj.empty() || (params.image.empty())) {
show_additional_info(argc, argv);
for (auto & image : params.image) {
int n_past = 0;
- auto ctx_llava = minicpmv_init(&params, image, n_past);
+ auto * ctx_llava = minicpmv_init(&params, image, n_past);
if (!params.prompt.empty()) {
- LOG_TEE("<user>%s\n", params.prompt.c_str());
- LOG_TEE("<assistant>");
- auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+ LOG("<user>%s\n", params.prompt.c_str());
+ LOG("<assistant>");
+ auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
- std::string response = "";
+ std::string response;
bool have_tmp = false;
for (int i = 0; i < max_tgt_len; i++) {
- auto tmp = llama_loop(ctx_llava, smpl, n_past);
+ const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0){
- if(!have_tmp)continue;
- else break;
+ if (!have_tmp) {
+ continue;
+ }
+ break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
have_tmp = true;
gpt_sampler_free(smpl);
}else {
while (true) {
- LOG_TEE("<user>");
+ LOG("<user>");
std::string prompt;
std::getline(std::cin, prompt);
- LOG_TEE("<assistant>");
- auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+ LOG("<assistant>");
+ auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
- std::string response = "";
+ std::string response;
for (int i = 0; i < max_tgt_len; i++) {
- auto tmp = llama_loop(ctx_llava, smpl, n_past);
+ const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
#include "arg.h"
#include "common.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cstdio>
return 1;
}
+ gpt_init();
+
const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams
const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("lookahead", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
{
const std::string token_str = llama_token_to_piece(ctx, id);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
fflush(stdout);
}
}
}
if (llama_decode(ctx, batch) != 0) {
- fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+ LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}
const std::string token_str = llama_token_to_piece(ctx, id);
if (v == 0) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
} else {
// print light cyan
- printf("\033[0;96m%s\033[0m", token_str.c_str());
+ LOG("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);
// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
- printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+ LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
}
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
- printf(" - ngram %2d: ", i);
+ LOG(" - ngram %2d: ", i);
const int idx = id*(N - 1)*G + i*(N - 1);
for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
- printf("\n");
+ LOG("\n");
}
}
auto t_dec_end = ggml_time_us();
- LOG_TEE("\n\n");
+ LOG("\n\n");
- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
- LOG_TEE("\n");
- LOG_TEE("W = %2d\n", W);
- LOG_TEE("N = %2d\n", N);
- LOG_TEE("G = %2d\n", G);
- LOG_TEE("\n");
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_accept = %d\n", n_accept);
+ LOG_INF("\n");
+ LOG_INF("W = %2d\n", W);
+ LOG_INF("N = %2d\n", N);
+ LOG_INF("G = %2d\n", G);
+ LOG_INF("\n");
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_accept = %d\n", n_accept);
- LOG_TEE("\n");
+ LOG_INF("\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "llama.h"
#include "ggml.h"
-#include <cmath>
#include <cstdint>
#include <cstdio>
+#include <cinttypes>
#include <fstream>
#include <string>
#include <vector>
-#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
return 1;
}
+ gpt_init();
+
const int n_draft = params.n_draft;
// init llama.cpp
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
- fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+ LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
- LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+ LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
ngram_cache_context.clear();
}
- LOG_TEE("\n");
+ LOG("\n");
- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
- LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+ LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "common.h"
#include "ngram-cache.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cstdint>
return 1;
}
+ gpt_init();
+
// max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("lookup", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
- fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+ LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
fflush(stderr);
}
// print current draft sequence
- LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+ LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
int i_dft = 0;
while (true) {
const std::string token_str = llama_token_to_piece(ctx, id);
if (!params.use_color) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
if (llama_token_is_eog(model, id)) {
// check if the target token matches the draft
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
- LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+ LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept;
++n_past;
++i_dft;
if (params.use_color) {
// color accepted draft token
- printf("\033[34m%s\033[0m", token_str.c_str());
+ LOG("\033[34m%s\033[0m", token_str.c_str());
fflush(stdout);
}
continue;
}
if (params.use_color) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
fflush(stdout);
- LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+ LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
draft.clear();
draft.push_back(id);
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
- LOG_TEE("\n\n");
+ LOG("\n\n");
- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
- LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+ LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
- LOG_TEE("\ntarget:\n\n");
+ LOG_INF("\ntarget:\n\n");
gpt_perf_print(ctx, smpl);
gpt_sampler_free(smpl);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "arg.h"
#include "common.h"
#include "console.h"
+#include "log.h"
#include "sampling.h"
#include "llama.h"
#include <cassert>
-#include <cinttypes>
-#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
static bool is_interacting = false;
static bool need_insert_eot = false;
-static void print_usage(int, char ** argv) {
- printf("\nexample usage:\n");
- printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
- printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
- printf("\n");
+static void print_usage(int argc, char ** argv) {
+ (void) argc;
+
+ LOG("\nexample usage:\n");
+ LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+ LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+ LOG("\n");
}
static bool file_exists(const std::string & path) {
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
- __func__, params.logdir.c_str());
+ LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
return;
}
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
need_insert_eot = true;
} else {
console::cleanup();
- printf("\n");
+ LOG("\n");
gpt_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
_exit(130);
}
#endif
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
-}
-
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
llama_chat_msg new_msg{role, content};
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
- LOG("formatted: %s\n", formatted.c_str());
+ LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
}
return 1;
}
- auto & sparams = params.sparams;
-
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("main", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+ gpt_init();
- // TODO: Dump params ?
- //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+ auto & sparams = params.sparams;
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
atexit([]() { console::cleanup(); });
if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+ LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
- print_build_info();
+ LOG_INF("%s: llama backend init\n", __func__);
- LOG("%s: llama backend init\n", __func__);
llama_backend_init();
llama_numa_init(params.numa);
g_smpl = &smpl;
// load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
}
- LOG("%s: llama threadpool init = n_threads = %d\n",
- __func__,
- (int) params.cpuparams.n_threads
- );
+ LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
- LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
- exit(1);
+ LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+ return 1;
}
// Start the non-batch threadpool in the paused state
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
- LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
- exit(1);
+ LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ return 1;
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);
if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// print chat template example in conversation mode
if (params.conversation) {
if (params.enable_chat_template) {
- LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+ LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
} else {
- LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+ LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
}
// print system information
{
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
}
std::string path_session = params.path_prompt_cache;
std::vector<llama_token> session_tokens;
if (!path_session.empty()) {
- LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+ LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
if (!file_exists(path_session)) {
- LOG_TEE("%s: session file does not exist, will create.\n", __func__);
+ LOG_INF("%s: session file does not exist, will create.\n", __func__);
} else if (file_is_empty(path_session)) {
- LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
+ LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
- LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+ LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
session_tokens.resize(n_token_count_out);
- LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+ LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
}
}
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model));
}
- LOG("add_bos: %d\n", add_bos);
+
+ LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
std::vector<llama_token> embd_inp;
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
- LOG("tokenize the prompt\n");
+ LOG_DBG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
} else {
- LOG("use session tokens\n");
+ LOG_DBG("use session tokens\n");
embd_inp = session_tokens;
}
- LOG("prompt: \"%s\"\n", log_tostr(prompt));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
}
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else {
- LOG_TEE("error: input is empty\n");
+ LOG_ERR("input is empty\n");
return -1;
}
}
// Tokenize negative prompt
if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
n_matching_session_tokens++;
}
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
- LOG_TEE("%s: using full prompt from session file\n", __func__);
+ LOG_INF("%s: using full prompt from session file\n", __func__);
} else if (n_matching_session_tokens >= embd_inp.size()) {
- LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
+ LOG_INF("%s: session file has exact match for prompt!\n", __func__);
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
- LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
} else {
- LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
- LOGLN(
- "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
- log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
+ LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+ embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
// if we will use the cache for the full prompt without reaching the end of the cache, force
// reevaluation of the last token to recalculate the cached logits
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
- LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
+ LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
session_tokens.resize(embd_inp.size() - 1);
}
}
if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (params.n_keep > add_bos) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- LOG_TEE("'\n");
+ LOG("'\n");
}
- LOG_TEE("\n");
+ LOG_INF("\n");
}
// ctrl+C handling
}
if (params.interactive) {
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG("%s: interactive mode on.\n", __func__);
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
- LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+ LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG("Input prefix with BOS\n");
}
if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
smpl = gpt_sampler_init(model, sparams);
if (!smpl) {
- fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
- exit(1);
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+ return 1;
}
- LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
- LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
- LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+ LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
- LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+ LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
}
- LOG_TEE("\n\n");
+ LOG("\n");
if (params.interactive) {
const char * control_message;
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG( " - Press Ctrl+C to interject at any time.\n");
#endif
- LOG_TEE( "%s\n", control_message);
+ LOG( "%s\n", control_message);
is_interacting = params.interactive_first;
}
llama_token * enc_input_buf = embd_inp.data();
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
embd.resize(max_embd_size);
console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
- fflush(stdout);
}
if (ga_n == 1) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() >= n_ctx) {
if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}
const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;
- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
n_past -= n_discard;
- LOG("after swap: n_past = %d\n", n_past);
+ LOG_DBG("after swap: n_past = %d\n", n_past);
- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
- LOG("clear session path\n");
+ LOG_DBG("clear session path\n");
path_session.clear();
}
} else {
const int bd = (ga_w/ga_n)*(ga_n - 1);
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
- LOG("\n");
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
- LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+ LOG_DBG("\n");
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+ LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
ga_i += ga_w/ga_n;
- LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+ LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
}
}
n_eval = params.n_batch;
}
- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
- LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+ LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
}
}
need_to_save_session = false;
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
- LOG("saved session to %s\n", path_session.c_str());
+ LOG_DBG("saved session to %s\n", path_session.c_str());
}
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
- gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
+ gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
- // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
embd.push_back(id);
// decrement remaining sampling budget
--n_remain;
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
// some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
- gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
+ gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output
- fprintf(stdout, "%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
output_tokens.push_back(id);
output_ss << token_str;
}
-
- fflush(stdout);
}
}
}
if (is_antiprompt) {
- LOG("found antiprompt: %s\n", last_output.c_str());
+ LOG_DBG("found antiprompt: %s\n", last_output.c_str());
}
}
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
- LOG("found an EOG token\n");
+ LOG_DBG("found an EOG token\n");
if (params.interactive) {
if (!params.antiprompt.empty()) {
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
- printf("\n");
+ LOG("\n");
}
}
}
if (n_past > 0 && is_interacting) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");
if (params.conversation) {
- printf("\n> ");
+ LOG("\n> ");
}
if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
- printf("%s", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG("%s", params.input_prefix.c_str());
}
// color user input only
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
- printf("%s", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
}
- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());
const size_t original_size = embd_inp.size();
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
// if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) {
assistant_ss.str("");
n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
} else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
}
input_echo = false; // do not echo this again
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
- LOG_TEE(" [end of text]\n");
+ LOG(" [end of text]\n");
break;
}
}
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
- LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+ LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
- LOG_TEE("\n");
+ LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
-#ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
return 0;
}
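// --- illustration, not part of the diff ---
// Several failure paths in the hunks above change exit(1) to "return 1" (the
// threadpool and sampler init errors, for example). The diff does not state
// its rationale; one common motivation, sketched below with hypothetical
// names, is that returning from main() unwinds local objects while exit()
// skips their destructors.
#include <cstdio>

struct scoped_resource {
    ~scoped_resource() { fprintf(stderr, "resource released\n"); }
};

static bool init_fails() { return true; }        // stand-in for a failing init step

int main() {
    scoped_resource res;                         // e.g. a wrapper owning a threadpool
    if (init_fails()) {
        fprintf(stderr, "init failed\n");
        return 1;                                // ~scoped_resource() still runs
        // exit(1) here would terminate without running it
    }
    return 0;
}
// --- end illustration ---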
#include "arg.h"
#include "common.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
#include <cmath>
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
- printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
+ LOG_INF("\n");
+ LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
+ LOG_INF("\n");
}
// Define a split string function to ...
return 1;
}
+ gpt_init();
+
// number of simultaneous "clients" to simulate
const int32_t n_clients = params.n_parallel;
const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("parallel", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
// load the prompts from an external file if there are any
if (params.prompt.empty()) {
- printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
+ LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
} else {
// Output each line of the input params.prompts vector and copy to k_prompts
int index = 0;
- printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
+ LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
std::vector<std::string> prompts = split_string(params.prompt, '\n');
for (const auto& prompt : prompts) {
k_prompts.resize(index + 1);
k_prompts[index] = prompt;
index++;
- printf("%3d prompt: %s\n", index, prompt.c_str());
+ LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
}
}
- fprintf(stderr, "\n\n");
- fflush(stderr);
+ LOG_INF("\n\n");
const int n_ctx = llama_n_ctx(ctx);
const auto t_main_start = ggml_time_us();
- LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
- LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
- LOG_TEE("\n");
+ LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
+ LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+ LOG_INF("\n");
{
- LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
+ LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
}
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
- LOG_TEE("\n");
+ LOG_INF("\n");
}
- LOG_TEE("Processing requests ...\n\n");
+ LOG_INF("Processing requests ...\n\n");
while (true) {
if (dump_kv_cache) {
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
}
- LOG_TEE("%s: clearing the KV cache\n", __func__);
+ LOG_INF("%s: clearing the KV cache\n", __func__);
}
// insert new sequences for decoding
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;
- LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+ LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
g_seq_id += 1;
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
- LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+ LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
return 1;
}
- LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+ LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
n_cache_miss += 1;
continue;
}
- LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+ LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
for (auto & client : clients) {
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
const auto t_main_end = ggml_time_us();
- LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
+ LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
print_date_time();
- LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+ LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
if (params.prompt_file.empty()) {
params.prompt_file = "used built-in defaults";
}
- LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
- LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+ LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+ LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
- LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
- LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
- LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
- LOG_TEE("Cache misses: %6d\n", n_cache_miss);
+ LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
+ LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
+ LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+ LOG_INF("Cache misses: %6d\n", n_cache_miss);
- LOG_TEE("\n");
+ LOG_INF("\n");
// TODO: print sampling/grammar timings for all clients
llama_perf_context_print(ctx);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
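// --- illustration, not part of the diff ---
// Each example now calls gpt_init() right after argument parsing, replacing
// the removed "#ifndef LOG_DISABLE_LOGS ... #endif" blocks (log_set_target,
// log_dump_cmdline, llama_log_set). A hedged sketch of that startup shape;
// gpt_init_sketch() below is a hypothetical stand-in, since the real
// gpt_init() lives in common/ and its body is not shown in this diff.
#include <cstdio>

static void gpt_init_sketch() {
    // assumed responsibilities: hook llama.cpp's log callback into the common
    // logger and print build info once, so examples stop doing it themselves
    fprintf(stderr, "common logging initialized\n");
}

int main(int argc, char ** argv) {
    (void) argc; (void) argv;
    // argument parsing stays per-example (not shown)
    gpt_init_sketch();                            // single shared setup call
    // ... example-specific work follows ...
    return 0;
}
// --- end illustration ---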
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <cmath>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
int n_junk = params.n_junk;
int n_keep = params.n_keep;
int n_grp = params.grp_attn_n;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
if (model == NULL) {
- fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: unable to load model\n" , __func__);
return 1;
}
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
if (ctx == NULL) {
- fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1;
}
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+ LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
// print the prompt token-by-token
- LOG_TEE("\n");
- LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
- LOG_TEE("prompt tokens: %d\n", n_tokens_all);
- //LOG_TEE("prompt: %s\n", params.prompt.c_str());
+ LOG_INF("\n");
+ LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
+ LOG_INF("prompt tokens: %d\n", n_tokens_all);
+ //LOG_INF("prompt: %s\n", params.prompt.c_str());
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
}
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_INF("%s: llama_decode() failed\n", __func__);
return 1;
}
- LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+ LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
if (i + n_batch >= n_tokens_all) {
break;
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch;
- LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
+ LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
}
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return 1;
}
- LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+ LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
}
{
const int n_discard = n_past - n_ctx + n_predict;
if (n_discard > 0) {
- LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+ LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
}
}
- LOG_TEE("\n");
- LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
- LOG_TEE("\n");
+ LOG_INF("\n");
+ LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+ LOG_INF("\n");
// main loop
int n_cur = n_tokens_all;
int n_decode = 0;
- LOG_TEE("%s", prompt_suffix.c_str());
- fflush(stdout);
+ LOG_INF("%s", prompt_suffix.c_str());
const auto t_main_start = ggml_time_us();
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
- LOG_TEE("\n");
+ LOG("\n");
break;
}
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
n_decode += 1;
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
- LOG_TEE("\n");
+ LOG("\n");
const auto t_main_end = ggml_time_us();
- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
- fprintf(stderr, "\n");
+ LOG("\n");
llama_sampler_free(smpl);
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
+#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
}
if (params.hellaswag) {
- fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+ LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
return;
}
const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
return;
}
FILE * logfile = fopen(logfile_path.c_str(), "w");
if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
return;
}
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+ LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
prob_history.resize(tokens.size());
if (params.ppl_stride <= 0) {
- fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+ LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = n_ctx;
- fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+ LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
- fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+ LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
int count = 0;
double nll = 0.0;
- fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+ LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
- //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+ //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
- //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+ //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- //fprintf(stderr, "%s : failed to eval\n", __func__);
+ //LOG_ERR("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
+ LOG("\n");
- //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+ //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
- printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+ LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
- fflush(stdout);
}
- printf("\n");
+ LOG("\n");
return {tokens, std::exp(nll / count), logit_history, prob_history};
}
if (!params.logits_file.empty()) {
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
if (!logits_stream.is_open()) {
- fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
return {};
}
- fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+ LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
}
auto tim1 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
auto tim2 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+ LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+ LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
logits.reserve((size_t)n_ctx * n_vocab);
}
- fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+ LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
}
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_INF("%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
llama_synchronize(ctx);
const auto t_end = std::chrono::high_resolution_clock::now();
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total*n_chunk/n_seq);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
+ LOG("\n");
for (int seq = 0; seq < n_seq_batch; seq++) {
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
} else {
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
- printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+ LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
}
}
- fflush(stdout);
logits.clear();
}
- printf("\n");
+ LOG("\n");
nll2 /= count;
nll /= count;
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
- printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+ LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
- printf("Unexpected negative standard deviation of log(prob)\n");
+ LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
}
llama_batch_free(batch);
const int ret = llama_decode(ctx, batch_view);
if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
return false;
}
}
if (prompt_lines.size() % 6 != 0) {
- fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+ LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
return;
}
size_t hs_task_count = prompt_lines.size()/6;
- fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+ LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
- fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+ LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly.
bool randomize_tasks = true;
std::vector<llama_token> seq_tokens[4];
};
- fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
+ LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
// Select and read data from prompt lines
std::vector<hs_data_t> hs_data(hs_task_count);
}
}
- fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+ LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
- printf("\ntask\tacc_norm\n");
+ LOG("\ntask\tacc_norm\n");
double acc = 0.0f;
}
if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
}
}
- //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+ //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
// If the gold ending got the maximum logprob, add one accuracy point
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
}
// Print the accumulated accuracy mean x 100
- printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
- fflush(stdout);
+ LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
}
i0 = i1 - 1;
llama_batch_free(batch);
- printf("\n");
+ LOG("\n");
}
struct winogrande_entry {
}
}
if (ipos != 4) {
- printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+ LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
continue;
}
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
if (sentence[where] == '_') break;
}
if (where == int(sentence.size())) {
- printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+ LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
continue;
}
std::istringstream stream(answer.c_str());
int i_answer; stream >> i_answer;
if (stream.fail() || i_answer < 1 || i_answer > 2) {
- printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+ LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
continue;
}
result.emplace_back();
auto data = load_winogrande_from_csv(params.prompt);
if (data.empty()) {
- fprintf(stderr, "%s: no tasks\n", __func__);
+ LOG_ERR("%s: no tasks\n", __func__);
return;
}
- fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+ LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
- fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+ LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
std::mt19937 rng(1);
std::vector<int> aux(data.size());
for (int i = 0; i < int(data.size()); ++i) {
data = std::move(selected);
}
- fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+ LOG_INF("%s : tokenizing selected tasks\n", __func__);
for (auto & task : data) {
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
}
- fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+ LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
}
if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
++n_done;
// print the accumulated accuracy mean x 100
- printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
- fflush(stdout);
+ LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
}
i0 = i1 - 1;
}
- printf("\n");
+ LOG("\n");
if (n_done < 100) return;
const float p = 1.f*n_correct/n_done;
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
- printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+
+ LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
}
static bool deserialize_string(std::istream & in, std::string & str) {
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
if (task.question.empty() || task.mc1.answers.empty()) {
if (log_error) {
- printf("%s: found bad task with empty question and/or answers\n", __func__);
+ LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
}
return false;
}
for (auto& answer : task.mc1.answers) {
if (answer.empty()) {
if (log_error) {
- printf("%s: found empty answer\n", __func__);
+ LOG_ERR("%s: found empty answer\n", __func__);
}
return false;
}
uint32_t n_task;
strstream.read((char *)&n_task, sizeof(n_task));
if (strstream.fail() || n_task == 0) {
- printf("%s: no tasks\n", __func__);
+ LOG_ERR("%s: no tasks\n", __func__);
return;
}
- printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+ LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
std::vector<uint32_t> task_pos(n_task);
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
if (strstream.fail()) {
- printf("%s: failed to read task positions from prompt\n", __func__);
+ LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
return;
}
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
// Use all tasks
tasks.resize(n_task);
- printf("%s: reading tasks", __func__);
+ LOG_INF("%s: reading tasks", __func__);
int n_dot = std::max((int) n_task/100, 1);
int i = 0;
for (auto& task : tasks) {
++i;
if (!task.deserialize(strstream)) {
- printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+ LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
return;
}
- if (i%n_dot == 0) printf(".");
+ if (i%n_dot == 0) LOG(".");
}
- printf("done\n");
+ LOG("done\n");
}
else {
- printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+ LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
std::mt19937 rng(1);
std::vector<int> aux(n_task);
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
aux.pop_back();
strstream.seekg(task_pos[idx], std::ios::beg);
if (!task.deserialize(strstream)) {
- printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+ LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
return;
}
}
n_task = params.multiple_choice_tasks;
}
- printf("%s: preparing task data", __func__);
- fflush(stdout);
+ LOG_INF("%s: preparing task data", __func__);
if (n_task > 500) {
- printf("...");
- fflush(stdout);
+ LOG("...");
std::atomic<int> counter(0);
std::atomic<int> n_bad(0);
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
for (auto& w : workers) w = std::thread(prepare);
prepare();
for (auto& w : workers) w.join();
- printf("done\n");
- fflush(stdout);
+ LOG("done\n");
int nbad = n_bad;
if (nbad > 0) {
- printf("%s: found %d malformed tasks\n", __func__, nbad);
+ LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
return;
}
} else {
return;
}
if (i_task%n_dot == 0) {
- printf(".");
- fflush(stdout);
+ LOG(".");
}
}
- printf("done\n");
+ LOG("done\n");
}
- printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+ LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
- printf("\ntask\tacc_norm\n");
+ LOG("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
}
if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
return;
}
// decode all tasks [i0, i1)
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
return;
}
// compute the logprobs for each ending of the decoded tasks
for (size_t i = i0; i < i1; ++i) {
auto & cur_task = tasks[i];
- //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+ //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
// if (cur_task.mc1.labels[j] == 1) {
- // printf("%d", j+1);
+ // LOG("%d", j+1);
// }
//}
- //printf("\n common_prefix: %zu\n", cur_task.common_prefix);
+ //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
// get the logits of the last token of the common prefix
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
size_t count = 1;
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
- //printf(" %zu %g\n", ir, eval_results[ir]);
+ //LOG(" %zu %g\n", ir, eval_results[ir]);
++count;
log_prob += eval_results[ir++];
}
cur_task.log_probs[s] = log_prob / count;
- //printf(" Final: %g\n", log_prob / count);
- //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+ //LOG(" Final: %g\n", log_prob / count);
+ //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
}
// Find the ending with maximum logprob
++n_done;
// Print the accumulated accuracy mean x 100
- printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
- fflush(stdout);
+ LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
}
i0 = i1 - 1;
float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1));
- printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+ LOG("\n");
+ LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
p = 1.f*n_done/n_tot_answers;
sigma = sqrt(p*(1-p)/(n_done-1));
- printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+ LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
- printf("\n");
+ LOG_INF("\n");
}
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
if (params.logits_file.empty()) {
- fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+ LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return;
}
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
if (!in) {
- fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
return;
}
{
char check[9]; check[8] = 0;
in.read(check, 8);
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
- fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
return;
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
- fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+ LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
}
in.read((char *)&n_vocab, sizeof(n_vocab));
in.read((char *)&n_chunk, sizeof(n_chunk));
if (in.fail()) {
- fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return;
}
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
- fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+ LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
- fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const auto t_start = std::chrono::high_resolution_clock::now();
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
- fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+ LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
return;
}
// TODO: use llama_batch.logits instead of relying on logits_all == true
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
return;
}
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
- printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
+ LOG("%.2f minutes\n", total_seconds / 60.0);
}
+ LOG("\n");
+ LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
const int first = n_ctx/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
p_diff_ptr += n_ctx - 1 - first;
kld_ptr += n_ctx - 1 - first;
- printf("%4d", i+1);
+ LOG("%4d", i+1);
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
- printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+ LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
- printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+ LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
- printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+ LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
- printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+ LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
double p_top_val = 1.*kld.n_same_top/kld.count;
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
- printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+ LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
- printf("\n");
-
- fflush(stdout);
+ LOG("\n");
logits.clear();
}
- printf("\n");
+ LOG("\n");
if (kld.count < 100) return; // we do not wish to do statistics on so few values
std::sort(kld_values.begin(), kld_values.end());
std::sort(p_diff_values.begin(), p_diff_values.end());
- printf("====== Perplexity statistics ======\n");
+ LOG("====== Perplexity statistics ======\n");
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
const double ppl_val = exp(log_ppl.first);
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
- printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+ LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
const double ppl_base_val = exp(log_ppl_base.first);
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
- printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+ LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
- // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+ // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
- printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+ LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
- printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+ LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
const double ppl_ratio_val = exp(log_ppl_ratio_val);
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
- printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+ LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
const double ppl_diff_val = ppl_val - ppl_base_val;
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
- printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+ LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
- printf("\n");
+ LOG("\n");
- printf("====== KL divergence statistics ======\n");
+ LOG("====== KL divergence statistics ======\n");
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
- printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+ LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
: kld_values[kld_values.size()/2];
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
};
- printf("Maximum KLD: %10.6f\n", kld_values.back());
- printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
- printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
- printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
- printf("Median KLD: %10.6f\n", kld_median);
- printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
- printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
- printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
- printf("Minimum KLD: %10.6f\n", kld_values.front());
+ LOG("Maximum KLD: %10.6f\n", kld_values.back());
+ LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
+ LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+ LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+ LOG("Median KLD: %10.6f\n", kld_median);
+ LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
+ LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
+ LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
+ LOG("Minimum KLD: %10.6f\n", kld_values.front());
- printf("\n");
+ LOG("\n");
- printf("====== Token probability statistics ======\n");
+ LOG("====== Token probability statistics ======\n");
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
- printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
+ LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
: p_diff_values[p_diff_values.size()/2];
- printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
- printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
- printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
- printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
- printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
- printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
- printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
- printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
- printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
- printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
- printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
- printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
- printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
+ LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
+ LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+ LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+ LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+ LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+ LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+ LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
+ LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+ LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+ LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+ LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+ LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+ LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
- // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+ // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
const double p_diff_rms_val = sqrt(p_diff_mse.first);
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
- printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+ LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
const double same_top_p = 1.0*kld.n_same_top/kld.count;
- printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
-
+ LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
const int32_t n_ctx = params.n_ctx;
if (n_ctx <= 0) {
- fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+ LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
return 1;
}
}
if (params.ppl_stride > 0) {
- fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+ LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
- print_build_info();
-
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
struct results_perplexity results;
results = perplexity(ctx, params, n_ctx);
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
+
write_logfile(ctx, params, model, results);
llama_free(ctx);
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
#include <algorithm>
#include <fstream>
+#include <iostream> // TODO: remove me
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+ LOG("\n");
}
struct chunk {
// original file position
size_t filepos;
// original text data
- std::string textdata = "";
+ std::string textdata;
// tokenized text data
std::vector<llama_token> tokens;
// embedding
std::ifstream f(filename.c_str());
if (!f.is_open()) {
- fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+ LOG_ERR("could not open file %s\n", filename.c_str());
return chunks;
}
chunk current_chunk;
char buffer[1024];
int64_t filepos = 0;
- std::string current = "";
+ std::string current;
while (f.read(buffer, 1024)) {
current += std::string(buffer, f.gcount());
size_t pos;
llama_kv_cache_clear(ctx);
// run model
- fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+ LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
- fprintf(stderr, "%s : failed to decode\n", __func__);
+ LOG_ERR("%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
- fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+ LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
return 1;
}
+ gpt_init();
+
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
params.embedding = true;
if (params.chunk_size <= 0) {
- fprintf(stderr, "chunk_size must be positive\n");
+ LOG_ERR("chunk_size must be positive\n");
return 1;
}
if (params.context_files.empty()) {
- fprintf(stderr, "context_files must be specified\n");
+ LOG_ERR("context_files must be specified\n");
return 1;
}
- print_build_info();
-
- printf("processing files:\n");
+ LOG_INF("processing files:\n");
for (auto & context_file : params.context_files) {
- printf("%s\n", context_file.c_str());
+ LOG_INF("%s\n", context_file.c_str());
}
std::vector<chunk> chunks;
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
- printf("Number of chunks: %ld\n", chunks.size());
+ LOG_INF("Number of chunks: %ld\n", chunks.size());
llama_backend_init();
llama_numa_init(params.numa);
llama_context * ctx = llama_init.context;
if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
- fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+ LOG_ERR("%s: pooling type NONE not supported\n", __func__);
return 1;
}
if (n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
}
// max batch size
for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) {
- fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+ LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) chunks.size(); i++) {
- fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
- fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+ LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
- fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+ LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
}
- fprintf(stderr, "\n\n");
+ LOG_INF("\n\n");
}
}
// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
- printf("Enter query: ");
+ LOG("Enter query: ");
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
return a.second > b.second;
});
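+ // similarities holds (chunk index, cosine similarity to the query embedding) pairs, sorted in descending order; print the top-k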
- printf("Top %d similar chunks:\n", params.sparams.top_k);
+ LOG("Top %d similar chunks:\n", params.sparams.top_k);
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
- printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
- printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
- printf("similarity: %f\n", similarities[i].second);
- printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
- printf("--------------------\n");
+ LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+ LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+ LOG("similarity: %f\n", similarities[i].second);
+ LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+ LOG("--------------------\n");
}
}
}
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_context_print(ctx);
// clean up
set(TARGET llama-server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
+
+option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
-target_compile_definitions(${TARGET} PRIVATE
- SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
-| `--log-format {text, json}` | log output format: json or text (default: json) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
--parallel 8 \
--batch-size 512 \
--ctx-size 4096 \
- --log-format text \
-ngl 33
```
server_args.append('--cont-batching')
server_args.append('--metrics')
server_args.append('--flash-attn')
- server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
pkwargs = {
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "sampling.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "loading.html.hpp"
#include <atomic>
-#include <chrono>
#include <condition_variable>
#include <cstddef>
+#include <cinttypes>
+#include <deque>
+#include <memory>
#include <mutex>
-#include <thread>
#include <signal.h>
-#include <memory>
-#include <unordered_set>
+#include <thread>
#include <unordered_map>
-#include <deque>
+#include <unordered_set>
-using json = nlohmann::ordered_json;
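+ // scoped logging helpers: each line is prefixed with its scope ("slot", "srv" or "que") and the calling
+ // function name, truncated/padded to 12 chars via "%12.*s"; the SLT_* variants also print the slot and task ids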
+#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
-bool server_verbose = false;
-bool server_log_json = true;
+#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+
+using json = nlohmann::ordered_json;
enum stop_type {
STOP_TYPE_FULL,
std::function<void(int)> callback_on_release;
void reset() {
+ SLT_DBG(*this, "%s", "\n");
+
n_prompt_tokens = 0;
generated_text = "";
truncated = false;
return state != SLOT_STATE_IDLE;
}
- void add_token_string(const completion_token_output & token) {
+ void add_token(const completion_token_output & token) {
if (!is_processing()) {
+ SLT_WRN(*this, "%s", "slot is not processing\n");
return;
}
generated_token_probs.push_back(token);
void release() {
if (is_processing()) {
+ SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
+
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
state = SLOT_STATE_IDLE;
- LOG_INFO("slot released", {
- {"id_slot", id},
- {"id_task", id_task},
- {"n_past", n_past},
- {"truncated", truncated},
- });
callback_on_release(id);
}
}
}
void print_timings() const {
- char buffer[512];
-
- double t_token = t_prompt_processing / n_prompt_tokens_processed;
- double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
-
- snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
- t_prompt_processing, n_prompt_tokens_processed,
- t_token, n_tokens_second);
-
- LOG_INFO(buffer, {
- {"id_slot", id},
- {"id_task", id_task},
- {"t_prompt_processing", t_prompt_processing},
- {"n_prompt_tokens_processed", n_prompt_tokens_processed},
- {"t_token", t_token},
- {"n_tokens_second", n_tokens_second},
- });
-
- t_token = t_token_generation / n_decoded;
- n_tokens_second = 1e3 / t_token_generation * n_decoded;
-
- snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
- t_token_generation, n_decoded,
- t_token, n_tokens_second);
-
- LOG_INFO(buffer, {
- {"id_slot", id},
- {"id_task", id_task},
- {"t_token_generation", t_token_generation},
- {"n_decoded", n_decoded},
- {"t_token", t_token},
- {"n_tokens_second", n_tokens_second},
- });
-
- snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
-
- LOG_INFO(buffer, {
- {"id_slot", id},
- {"id_task", id_task},
- {"t_prompt_processing", t_prompt_processing},
- {"t_token_generation", t_token_generation},
- {"t_total", t_prompt_processing + t_token_generation},
- });
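+ // average latency per token (ms) and throughput (tokens/s) for prompt processing and token generation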
+ const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
+ const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+
+ const double t_gen = t_token_generation / n_decoded;
+ const double n_gen_second = 1e3 / t_token_generation * n_decoded;
+
+ SLT_INF(*this,
+ "\n"
+ "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+ "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
+ "\r total time = %10.2f ms / %5d tokens\n",
+ t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
+ t_token_generation, n_decoded, t_gen, n_gen_second,
+ t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
}
};
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
- LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
+ QUE_DBG("new task, id = %d, front = %d\n", task.id, front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
for (auto & task : tasks) {
if (task.id == -1) {
task.id = id++;
- LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
+ QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front);
if (front) {
queue_tasks.push_front(std::move(task));
} else {
// Add a new task, but defer until one slot is available
void defer(server_task task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
+ QUE_DBG("defer task, id = %d\n", task.id);
queue_tasks_deferred.push_back(std::move(task));
condition_tasks.notify_one();
}
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
int new_id = id++;
- LOG_VERBOSE("new task id", {{"new_id", new_id}});
return new_id;
}
running = true;
while (true) {
- LOG_VERBOSE("new task may arrive", {});
+ QUE_DBG("%s", "processing new tasks\n");
while (true) {
std::unique_lock<std::mutex> lock(mutex_tasks);
server_task task = queue_tasks.front();
queue_tasks.pop_front();
lock.unlock();
- LOG_VERBOSE("callback_new_task", {{"id_task", task.id}});
+
+ QUE_DBG("processing task, id = %d\n", task.id);
callback_new_task(task);
}
// all tasks in the current loop is processed, slots data is now ready
- LOG_VERBOSE("callback_update_slots", {});
+ QUE_DBG("%s", "update slots\n");
callback_update_slots();
- LOG_VERBOSE("wait for new task", {});
+ QUE_DBG("%s", "waiting for new tasks\n");
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
if (!running) {
- LOG_VERBOSE("ending start_loop", {});
+ QUE_DBG("%s", "terminate\n");
return;
}
condition_tasks.wait(lock, [&]{
// add the id_task to the list of tasks waiting for response
void add_waiting_task_id(int id_task) {
- LOG_VERBOSE("waiting for task id", {{"id_task", id_task}});
+ SRV_DBG("waiting for task id = %d\n", id_task);
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(id_task);
// when the request is finished, we can remove task associated with it
void remove_waiting_task_id(int id_task) {
- LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}});
+ SRV_DBG("task id = %d is done\n", id_task);
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// Send a new result to a waiting id_task
void send(server_task_result & result) {
- LOG_VERBOSE("send new result", {{"id_task", result.id}});
+ SRV_DBG("sending result for task id = %d\n", result.id);
std::unique_lock<std::mutex> lock(mutex_results);
for (const auto & id_task : waiting_task_ids) {
if (result.id == id_task) {
- LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}});
+ SRV_DBG("task id = %d moved to result queue\n", result.id);
+
queue_results.push_back(std::move(result));
condition_results.notify_all();
return;
struct server_context {
llama_model * model = nullptr;
llama_context * ctx = nullptr;
- std::vector<llama_lora_adapter_container> lora_adapters;
+ std::vector<llama_lora_adapter_container> loras;
gpt_params params;
llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
- ctx = llama_init.context;
- lora_adapters = llama_init.lora_adapters;
+ ctx = llama_init.context;
+ loras = llama_init.lora_adapters;
+
params.n_parallel -= 1; // but be sneaky about it
+
if (model == nullptr) {
- LOG_ERROR("unable to load model", {{"model", params.model}});
+ SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
return false;
}
void init() {
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
- LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+ SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
for (int i = 0; i < params.n_parallel; i++) {
server_slot slot;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
- LOG_INFO("new slot", {
- {"id_slot", slot.id},
- {"n_ctx_slot", slot.n_ctx}
- });
+ SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
- LOG_INFO("slot self-extend", {
- {"id_slot", slot.id},
- {"ga_n", ga_n},
- {"ga_w", ga_w}
- });
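+ // ga_n: group-attention factor, ga_w: group-attention width (self-extend parameters)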
+ SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
}
slot.ga_i = 0;
}
if (ret != nullptr) {
- LOG_VERBOSE("selected slot by lcp similarity", {
- {"id_slot", ret->id},
- {"max_lcp_len", max_lcp_len},
- {"similarity", similarity},
- });
+ SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
}
}
}
if (ret != nullptr) {
- LOG_VERBOSE("selected slot by lru", {
- {"id_slot", ret->id},
- {"t_last", t_last},
- });
+ SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last);
}
}
}
if (slot.params.cache_prompt && slot.ga_n != 1) {
- LOG_WARNING("cache_prompt is not supported with group-attention", {});
slot.params.cache_prompt = false;
+ SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
}
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
// Might be better to reject the request with a 400 ?
- LOG_WARNING("Max tokens to predict exceeds server configuration", {
- {"params.n_predict", slot.params.n_predict},
- {"slot.n_predict", slot.n_predict},
- });
slot.params.n_predict = slot.n_predict;
+ SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict);
}
// infill
slot.state = SLOT_STATE_PROCESSING_PROMPT;
slot.prompt_tokens.clear();
- LOG_INFO("slot is processing task", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- });
+ SLT_INF(slot, "%s", "processing task\n");
return true;
}
void kv_cache_clear() {
- LOG_VERBOSE("clearing KV cache", {});
+ SRV_DBG("%s", "clearing KV cache\n");
// clear the entire KV cache
llama_kv_cache_clear(ctx);
}
void system_prompt_update() {
- LOG_VERBOSE("system prompt update", {
- {"system_prompt", system_prompt},
- });
+ SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
kv_cache_clear();
system_tokens.clear();
}
if (llama_decode(ctx, batch) != 0) {
- LOG_ERROR("llama_decode() failed", {});
+ SRV_ERR("%s", "llama_decode() failed\n");
return;
}
}
}
bool system_prompt_set(const std::string & sys_prompt) {
- system_prompt = sys_prompt;
+ SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
- LOG_VERBOSE("system prompt process", {
- {"system_prompt", system_prompt},
- });
+ system_prompt = sys_prompt;
// release all slots
for (server_slot & slot : slots) {
// add the token to slot queue and cache
}
- slot.add_token_string(result);
+ slot.add_token(result);
if (slot.params.stream) {
send_partial_response(slot, result);
}
slot.stopped_limit = true;
slot.has_next_token = false;
- LOG_VERBOSE("stopped by limit", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_decoded", slot.n_decoded},
- {"n_predict", slot.params.n_predict},
- });
+ SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
}
if (llama_token_is_eog(model, result.tok)) {
slot.stopped_eos = true;
slot.has_next_token = false;
- LOG_VERBOSE("eos token found", {});
- }
-
- auto n_ctx_train = llama_n_ctx_train(model);
- if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
- && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
- LOG_WARNING("n_predict is not set and self-context extend is disabled."
- " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
- { "id_slot", slot.id },
- { "params.n_predict", slot.params.n_predict },
- { "slot.n_prompt_tokens", slot.n_prompt_tokens },
- { "slot.n_decoded", slot.n_decoded },
- { "slot.n_predict", slot.n_predict },
- { "n_slots", params.n_parallel },
- { "slot.n_ctx", slot.n_ctx },
- { "n_ctx", n_ctx },
- { "n_ctx_train", n_ctx_train },
- { "ga_n", slot.ga_n },
- });
+ SLT_DBG(slot, "%s", "stopped by EOS\n");
+ }
+
+ const auto n_ctx_train = llama_n_ctx_train(model);
+
+ if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
slot.truncated = true;
slot.stopped_limit = true;
slot.has_next_token = false; // stop prediction
+
+ SLT_WRN(slot,
+ "n_predict (%d) is not set and self-context extend is disabled. "
+ "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
+ slot.params.n_predict, n_ctx_train);
}
- LOG_VERBOSE("next token", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"token", result.tok},
- {"token_text", tokens_to_output_formatted_string(ctx, result.tok)},
- {"has_next_token", slot.has_next_token},
- {"n_remain", slot.n_remaining},
- {"n_decoded", slot.n_decoded},
- {"stopped_eos", slot.stopped_eos},
- {"stopped_word", slot.stopped_word},
- {"stopped_limit", slot.stopped_limit},
- {"stopping_word", slot.stopping_word},
- });
+ SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
return slot.has_next_token; // continue
}
}
void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
- LOG_ERROR("task error", {
- {"id_task", id_task},
- {"error", error},
- });
+ SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str());
server_task_result res;
res.id = id_task;
}
if (embd == NULL) {
- LOG_ERROR("failed to get embeddings", {
- {"token", batch.token [i]},
- {"seq_id", batch.seq_id[i][0]}
- });
+ SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res.data = json {
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
}
+ SLT_DBG(slot, "%s", "sending embeddings\n");
+
queue_results.send(res);
}
task.type = SERVER_TASK_TYPE_COMPLETION;
if (replace_prompt) {
task.data = task_data;
- task.data["prompt"] = prompt;
+ task.data["prompt"] = std::move(prompt);
} else {
task.data = std::move(task_data);
}
std::vector<server_task> cancel_tasks;
cancel_tasks.reserve(id_tasks.size());
for (const auto & id_task : id_tasks) {
- LOG_VERBOSE("cancel task", {{"id_task", id_task}});
+ SRV_WRN("cancel task, id_task = %d\n", id_task);
+
server_task task;
task.type = SERVER_TASK_TYPE_CANCEL;
task.id_target = id_task;
}
// receive the results from task(s) created by create_tasks_cmpl
- void receive_cmpl_results(const std::unordered_set<int> & id_tasks, std::function<void(std::vector<server_task_result>&)> result_handler, std::function<void(json)> error_handler) {
+ void receive_cmpl_results(
+ const std::unordered_set<int> & id_tasks,
+ const std::function<void(std::vector<server_task_result>&)> & result_handler,
+ const std::function<void(json)> & error_handler) {
// TODO: currently, there is no way to detect the client has cancelled the request
std::vector<server_task_result> results(id_tasks.size());
for (size_t i = 0; i < id_tasks.size(); i++) {
}
// receive the results from task(s) created by create_tasks_cmpl, in stream mode
- void receive_cmpl_results_stream(const std::unordered_set<int> & id_tasks, std::function<bool(server_task_result&)> result_handler, std::function<void(json)> error_handler) {
+ void receive_cmpl_results_stream(
+ const std::unordered_set<int> & id_tasks, const
+ std::function<bool(server_task_result&)> & result_handler, const
+ std::function<void(json)> & error_handler) {
size_t n_finished = 0;
while (true) {
server_task_result result = queue_results.recv(id_tasks);
if (slot == nullptr) {
// if no slot is available, we defer this task for processing later
- LOG_VERBOSE("no slot is available", {{"id_task", task.id}});
+ SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
slot->index = json_value(task.data, "index", 0);
if (!launch_slot_with_task(*slot, task)) {
- LOG_ERROR("error while launching slot", task.data);
+ SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id);
break;
}
} break;
slots_data.push_back(slot_data);
}
- LOG_INFO("slot data", {
- {"id_task", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots}
- });
-
- LOG_VERBOSE("slot data", {
- {"id_task", task.id},
- {"n_idle_slots", n_idle_slots},
- {"n_processing_slots", n_processing_slots},
- {"slots", slots_data}
- });
+ SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots);
server_task_result res;
res.id = task.id;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
- LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
queue_tasks.defer(task);
break;
}
} break;
case SERVER_TASK_TYPE_SET_LORA:
{
- llama_lora_adapters_apply(ctx, lora_adapters);
+ llama_lora_adapters_apply(ctx, loras);
server_task_result result;
result.id = task.id;
result.stop = true;
}
if (all_idle) {
- LOG_INFO("all slots are idle", {});
+ SRV_INF("%s", "all slots are idle\n");
if (system_prompt.empty() && clean_kv_cache) {
kv_cache_clear();
}
}
{
- LOG_VERBOSE("posting NEXT_RESPONSE", {});
+ SRV_DBG("%s", "posting NEXT_RESPONSE\n");
server_task task;
task.type = SERVER_TASK_TYPE_NEXT_RESPONSE;
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
- LOG_INFO("slot context shift", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_keep", n_keep},
- {"n_left", n_left},
- {"n_discard", n_discard},
- {"n_ctx", n_ctx},
- {"n_past", slot.n_past},
- {"n_system_tokens", system_tokens.size()},
- {"n_cache_tokens", slot.cache_tokens.size()}
- });
+ SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
slot.cache_tokens.push_back(slot.sampled);
}
- LOG_VERBOSE("slot decode token", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_ctx", n_ctx},
- {"n_past", slot.n_past},
- {"n_system_tokens", system_tokens.size()},
- {"n_cache_tokens", slot.cache_tokens.size()},
- {"truncated", slot.truncated}
- });
+ SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
+ slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
}
// process in chunks of params.n_batch
// we haven't tokenized the prompt yet - do it now:
if (prompt_tokens.empty()) {
- LOG_VERBOSE("tokenizing prompt", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task}
- });
+ SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size());
slot.t_start_process_prompt = ggml_time_us();
slot.t_start_generation = 0;
slot.n_past = 0;
slot.n_prompt_tokens = prompt_tokens.size();
- LOG_VERBOSE("prompt tokenized", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_prompt_tokens", slot.n_prompt_tokens},
- {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
- });
+ SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
// empty prompt passed -> release the slot and send empty response
if (prompt_tokens.empty()) {
- LOG_INFO("empty prompt - releasing slot", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task}
- });
+ SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
slot.release();
slot.print_timings();
slot.truncated = true;
slot.n_prompt_tokens = prompt_tokens.size();
- LOG_VERBOSE("input truncated", {
- {"id_slot", slot.id},
- {"id_task", slot.id_task},
- {"n_ctx", slot.n_ctx},
- {"n_keep", slot.params.n_keep},
- {"n_left", n_left},
- {"n_prompt_tokens", slot.n_prompt_tokens},
- {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())},
- });
+ SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens);
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
}
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
// we have to evaluate at least 1 token to generate logits.
- LOG_INFO("we have to evaluate at least 1 token to generate logits", {
- { "id_slot", slot.id },
- { "id_task", slot.id_task }
- });
+ SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
slot.n_past--;
if (slot.ga_i > 0) {
// remove the non-common part from the cache
slot.cache_tokens.resize(slot.n_past);
- LOG_INFO("kv cache rm [p0, end)", {
- { "id_slot", slot.id },
- { "id_task", slot.id_task },
- { "p0", p0 }
- });
+ SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
slot_npast++;
}
- LOG_VERBOSE("prompt processing progress", {
- {"id_slot", slot.id},
- {"n_past", slot.n_past},
- {"n_ctx", n_ctx},
- {"n_tokens", batch.n_tokens},
- {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens},
- });
+ SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
// entire prompt has been processed
if (slot.n_past == slot.n_prompt_tokens) {
slot.n_decoded = 0;
slot.i_batch = batch.n_tokens - 1;
- LOG_VERBOSE("prompt done", {
- {"id_slot", slot.id},
- {"n_past", slot.n_past},
- {"n_ctx", n_ctx},
- {"n_tokens", batch.n_tokens},
- });
+ SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
}
}
}
if (batch.n_tokens == 0) {
- LOG_VERBOSE("no tokens to decode", {});
+ SRV_WRN("%s", "no tokens to decode\n");
return;
}
- LOG_VERBOSE("decoding batch", {
- {"n_tokens", batch.n_tokens},
- });
+ SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, batch_type == 1);
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
- LOG_TEE("\n");
- LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
- LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
- LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+ SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+ SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+ SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
slot.ga_i += slot.ga_w / slot.ga_n;
- LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+ SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
}
slot.n_past_se += n_tokens;
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
- LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
- {"i", i},
- {"n_batch", n_batch},
- {"ret", ret},
- });
+ SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
for (auto & slot : slots) {
slot.release();
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
n_batch /= 2;
i -= n_batch;
- LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
- {"i", i},
- {"n_batch", n_batch},
- {"ret", ret},
- });
+ SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
continue; // continue loop of n_batch
}
}
}
- LOG_VERBOSE("run slots completed", {});
+ SRV_DBG("%s", "run slots completed\n");
}
json model_meta() const {
return;
}
- LOG_INFO("request", {
- {"remote_addr", req.remote_addr},
- {"remote_port", req.remote_port},
- {"status", res.status},
- {"method", req.method},
- {"path", req.path},
- {"params", req.params},
- });
+ //LOG_INFO("request", {
+ // {"remote_addr", req.remote_addr},
+ // {"remote_port", req.remote_port},
+ // {"status", res.status},
+ // {"method", req.method},
+ // {"path", req.path},
+ // {"params", req.params},
+ //});
+ LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
- LOG_VERBOSE("request", {
- {"request", req.body},
- {"response", res.body},
- });
+ LOG_DBG("request: %s\n", req.body.c_str());
+ LOG_DBG("response: %s\n", res.body.c_str());
}
std::function<void(int)> shutdown_handler;
}
int main(int argc, char ** argv) {
-#if SERVER_VERBOSE != 1
- log_disable();
-#endif
// own arguments required by this example
gpt_params params;
return 1;
}
- // TODO: not great to use extern vars
- server_log_json = params.log_json;
- server_verbose = params.verbosity > 0;
+ gpt_init();
+
+ // enabling this will output extra debug information in the HTTP responses from the server
+ // see format_final_response_oaicompat()
+ const bool verbose = params.verbosity > 9;
// struct that contains llama context and inference
server_context ctx_server;
llama_backend_init();
llama_numa_init(params.numa);
- LOG_INFO("build info", {
- {"build", LLAMA_BUILD_NUMBER},
- {"commit", LLAMA_COMMIT}
- });
-
- LOG_INFO("system info", {
- {"n_threads", params.cpuparams.n_threads},
- {"n_threads_batch", params.cpuparams_batch.n_threads},
- {"total_threads", std::thread::hardware_concurrency()},
- {"system_info", llama_print_system_info()},
- });
+ LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
std::unique_ptr<httplib::Server> svr;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
svr->set_logger(log_server_request);
- auto res_error = [](httplib::Response & res, json error_data) {
+ auto res_error = [](httplib::Response & res, const json & error_data) {
json final_response {{"error", error_data}};
res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
res.status = json_value(error_data, "code", 500);
};
- auto res_ok = [](httplib::Response & res, json data) {
+ auto res_ok = [](httplib::Response & res, const json & data) {
res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
res.status = 200;
};
svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
std::string message;
try {
- std::rethrow_exception(std::move(ep));
+ std::rethrow_exception(ep);
} catch (std::exception & e) {
message = e.what();
} catch (...) {
}
json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
- LOG_VERBOSE("Got exception", formatted_error);
+ LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
res_error(res, formatted_error);
});
// API key is invalid or not provided
res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
- LOG_WARNING("Unauthorized: Invalid API Key", {});
+ LOG_WRN("Unauthorized: Invalid API Key\n");
return false;
};
}
res_ok(res, arr);
}
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
res_error(res, error_data);
});
} else {
const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) {
- ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool {
+ ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool {
return server_sent_event(sink, "data", result.data);
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
});
sink.done();
};
// TODO: maybe merge this function with "handle_completions_generic"
- const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+ const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
const auto completion_id = gen_chatcmplid();
if (!stream) {
- ctx_server.receive_cmpl_results(task_ids, [&](std::vector<server_task_result> & results) {
+ ctx_server.receive_cmpl_results(task_ids, [&](const std::vector<server_task_result> & results) {
// multitask is never support in chat completion, there is only one result
- json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id);
+ json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose);
res_ok(res, result_oai);
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
res_error(res, error_data);
});
} else {
const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) {
- ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool {
+ ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool {
std::vector<json> result_array = format_partial_response_oaicompat(result.data, completion_id);
for (auto & event_data : result_array) {
if (event_data.empty()) {
}
}
return true; // ok
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
});
static const std::string ev_done = "data: [DONE]\n\n";
for (const auto & res : results) {
responses.push_back(res.data);
}
- }, [&](json error_data) {
+ }, [&](const json & error_data) {
res_error(res, error_data);
error = true;
});
const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
json result = json::array();
- for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
- auto & la = ctx_server.lora_adapters[i];
+ for (size_t i = 0; i < ctx_server.loras.size(); ++i) {
+ auto & lora = ctx_server.loras[i];
result.push_back({
{"id", i},
- {"path", la.path},
- {"scale", la.scale},
+ {"path", lora.path},
+ {"scale", lora.scale},
});
}
res_ok(res, result);
const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
const std::vector<json> body = json::parse(req.body);
- int max_idx = ctx_server.lora_adapters.size();
+ int max_idx = ctx_server.loras.size();
// clear existing value
- for (auto & la : ctx_server.lora_adapters) {
- la.scale = 0.0f;
+ for (auto & lora : ctx_server.loras) {
+ lora.scale = 0.0f;
}
// set value
int id = entry.at("id");
float scale = entry.at("scale");
if (0 <= id && id < max_idx) {
- ctx_server.lora_adapters[id].scale = scale;
+ ctx_server.loras[id].scale = scale;
} else {
throw std::runtime_error("invalid adapter id");
}
// bind HTTP listen port, run the HTTP server in a thread
if (!svr->bind_to_port(params.hostname, params.port)) {
- LOG_ERROR("couldn't bind HTTP server socket", {
- {"hostname", params.hostname},
- {"port", params.port},
- });
+ //LOG_ERROR("couldn't bind HTTP server socket", {
+ // {"hostname", params.hostname},
+ // {"port", params.port},
+ //});
+ LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
clean_up();
- LOG_ERROR("exiting due to HTTP server error", {});
return 1;
}
std::thread t([&]() { svr->listen_after_bind(); });
svr->wait_until_ready();
- LOG_INFO("HTTP server is listening", log_data);
+ //LOG_INFO("HTTP server is listening", log_data);
+ LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http);
// load the model
- LOG_INFO("loading model", log_data);
+ LOG_INF("%s: loading model\n", __func__);
+
if (!ctx_server.load_model(params)) {
clean_up();
t.join();
- LOG_ERROR("exiting due to model loading error", {});
+ LOG_ERR("%s: exiting due to model loading error\n", __func__);
return 1;
- } else {
- ctx_server.init();
- state.store(SERVER_STATE_READY);
+ }
- LOG_INFO("model loaded", {});
+ ctx_server.init();
+ state.store(SERVER_STATE_READY);
- // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (params.chat_template.empty()) {
- if (!ctx_server.validate_model_chat_template()) {
- LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
- params.chat_template = "chatml";
- }
- }
+ LOG_INF("%s: model loaded\n", __func__);
- // print sample chat example to make it clear which template is used
- {
- LOG_INFO("chat template", {
- {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
- {"built_in", params.chat_template.empty()},
- });
+ // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
+ if (params.chat_template.empty()) {
+ if (!ctx_server.validate_model_chat_template()) {
+ LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
+ params.chat_template = "chatml";
}
+ }
- ctx_server.queue_tasks.on_new_task(std::bind(
- &server_context::process_single_task, &ctx_server, std::placeholders::_1));
- ctx_server.queue_tasks.on_update_slots(std::bind(
- &server_context::update_slots, &ctx_server));
+ // print sample chat example to make it clear which template is used
+ LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
- shutdown_handler = [&](int) {
- ctx_server.queue_tasks.terminate();
- };
- ctx_server.queue_tasks.start_loop();
- }
+ ctx_server.queue_tasks.on_new_task(std::bind(
+ &server_context::process_single_task, &ctx_server, std::placeholders::_1));
+ ctx_server.queue_tasks.on_update_slots(std::bind(
+ &server_context::update_slots, &ctx_server));
+
+ shutdown_handler = [&](int) {
+ ctx_server.queue_tasks.terminate();
+ };
+
+ LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+ ctx_server.queue_tasks.start_loop();
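For reference, a minimal standalone sketch (not part of the patch) of the callback-wiring pattern used above: member functions are registered on a queue via std::bind and the queue then drives them from its loop. The task_queue and worker names here are hypothetical stand-ins.

    #include <cstdio>
    #include <functional>
    #include <vector>

    struct task_queue {
        std::function<void(int)> on_new_task;

        void start_loop(const std::vector<int> & tasks) {
            for (int t : tasks) {
                if (on_new_task) {
                    on_new_task(t); // dispatch each task to the registered handler
                }
            }
        }
    };

    struct worker {
        void process_single_task(int id) { std::printf("processing task %d\n", id); }
    };

    int main() {
        task_queue queue;
        worker     ctx;

        // bind the member function to the context instance, as the server code above does
        queue.on_new_task = std::bind(&worker::process_single_task, &ctx, std::placeholders::_1);

        queue.start_loop({1, 2, 3});
        return 0;
    }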
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
-| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
### Run @bug, @wip or @wrong_usage annotated scenario
server_args.append('--verbose')
if context.lora_file:
server_args.extend(['--lora', context.lora_file])
- if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
- server_args.extend(['--log-format', "text"])
args = [str(arg) for arg in [context.server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
#pragma once
-#include "llama.h"
#include "common.h"
+#include "log.h"
+#include "llama.h"
#ifndef NDEBUG
// crash the server in debug mode, otherwise send an http 500 error
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
+#include <random>
+#include <sstream>
#include <string>
#include <vector>
-#include <sstream>
-#include <random>
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
-extern bool server_verbose;
-extern bool server_log_json;
-
-#ifndef SERVER_VERBOSE
-#define SERVER_VERBOSE 1
-#endif
-
-#if SERVER_VERBOSE != 1
-#define LOG_VERBOSE(MSG, ...)
-#else
-#define LOG_VERBOSE(MSG, ...) \
- do \
- { \
- if (server_verbose) \
- { \
- server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
- } \
- } while (0)
-#endif
-
-#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
-#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-
-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
-
template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
try {
return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
- std::stringstream ss;
- ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
- LOG_WARNING(ss.str().c_str(), body);
+ LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
return default_value;
}
} else {
}
}
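As an illustrative aside (not part of the patch), this is how a json_value-style helper typically behaves: typed access with a fallback default when a field is missing, null, or has the wrong type. The get_or name and the field names below are hypothetical, and <nlohmann/json.hpp> stands in for the bundled json.hpp.

    #include <cstdio>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    // hypothetical stand-in for json_value
    template <typename T>
    static T get_or(const json & body, const std::string & key, const T & default_value) {
        if (!body.contains(key) || body.at(key).is_null()) {
            return default_value;
        }
        try {
            return body.at(key).get<T>();
        } catch (const nlohmann::json::type_error &) {
            return default_value; // wrong type supplied: fall back to the default
        }
    }

    int main() {
        const json body = json::parse(R"({"n_predict": 32, "stream": "yes"})");

        std::printf("n_predict = %d\n", get_or(body, "n_predict", 16));        // present: prints 32
        std::printf("stream    = %d\n", (int) get_or(body, "stream", false));  // wrong type: falls back to false
        return 0;
    }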
-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
- std::stringstream ss_tid;
- ss_tid << std::this_thread::get_id();
- json log = json{
- {"tid", ss_tid.str()},
- {"timestamp", time(nullptr)},
- };
-
- if (server_log_json) {
- log.merge_patch({
- {"level", level},
- {"function", function},
- {"line", line},
- {"msg", message},
- });
-
- if (!extra.empty()) {
- log.merge_patch(extra);
- }
-
- printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
- } else {
- char buf[1024];
- snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
- if (!extra.empty()) {
- log.merge_patch(extra);
- }
- std::stringstream ss;
- ss << buf << " |";
- for (const auto & el : log.items())
- {
- const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
- ss << " " << el.key() << "=" << value;
- }
-
- const std::string str = ss.str();
- printf("%.*s\n", (int)str.size(), str.data());
- }
- fflush(stdout);
-}
-
//
// chat template utils
//
chat.push_back({role, content});
}
- auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
- LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
+ const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+ LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
+
return formatted_chat;
}
}
static std::string gen_chatcmplid() {
- std::stringstream chatcmplid;
- chatcmplid << "chatcmpl-" << random_string();
-
- return chatcmplid.str();
+ return "chatcmpl-" + random_string();
}
//
return std::string::npos;
}
-static bool json_is_array_of_numbers(json data) {
+static bool json_is_array_of_numbers(const json & data) {
if (data.is_array()) {
for (const auto & e : data) {
if (!e.is_number()) {
return out;
}
-static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
+static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
const std::string str =
std::string(event) + ": " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
- "\n\n";
+ "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
- LOG_VERBOSE("data stream", {
- { "to_send", str }
- });
+ LOG_DBG("data stream, to_send: %s", str.c_str());
return sink.write(str.c_str(), str.size());
}
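A minimal sketch (not part of the patch) of the framing that the trailing blank line provides: in the server-sent events format each event ends with an empty line, so chunks written back-to-back remain separable on the client side. The payloads below are hypothetical.

    #include <cstdio>
    #include <string>

    int main() {
        const std::string chunk = "{\"content\":\"hello\"}"; // hypothetical partial completion
        const std::string error = "{\"message\":\"oops\"}";  // hypothetical error payload

        // each event is "<name>: <data>" followed by a blank line
        std::printf("data: %s\n\n",  chunk.c_str());
        std::printf("error: %s\n\n", error.c_str());
        std::printf("data: [DONE]\n\n");
        return 0;
    }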
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
- for (auto & param : unsupported_params) {
+ for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
return llama_params;
}
-static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
+static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
{"id", completion_id}
};
- if (server_verbose) {
+ // extra fields for debugging purposes
+ if (verbose) {
res["__verbose"] = result;
}
}
// return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
+static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
json data = json::array();
int i = 0;
- for (auto & elem : embeddings) {
+ for (const auto & elem : embeddings) {
data.push_back(json{
{"embedding", json_value(elem, "embedding", json::array())},
{"index", i++},
#include "arg.h"
#include "common.h"
+#include "log.h"
#include "llama.h"
-#include <cmath>
-#include <cstdio>
-#include <string>
#include <vector>
static void print_usage(int, char ** argv) {
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
- LOG_TEE("\n");
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+ LOG("\n");
}
int main(int argc, char ** argv) {
return 1;
}
+ gpt_init();
+
// total length of the sequence including the prompt
const int n_predict = params.n_predict;
const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
- LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+ LOG("\n");
+ LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
- LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
- LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
+ LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+ LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}
// print the prompt token-by-token
- fprintf(stderr, "\n");
+ LOG("\n");
for (auto id : tokens_list) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
- fflush(stderr);
-
// create a llama_batch with size 512
// we use this object to submit token data for decoding
batch.logits[batch.n_tokens - 1] = true;
if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
while (n_cur <= n_predict) {
// sample the next token
{
- const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
+ const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
- LOG_TEE("\n");
+ LOG("\n");
break;
}
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
fflush(stdout);
// prepare the next batch
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}
- LOG_TEE("\n");
+ LOG("\n");
const auto t_main_end = ggml_time_us();
- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
- LOG_TEE("\n");
+ LOG("\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
- fprintf(stderr, "\n");
+ LOG("\n");
llama_batch_free(batch);
llama_sampler_free(smpl);
#include "arg.h"
#include "common.h"
#include "sampling.h"
+#include "log.h"
#include "llama.h"
+#include <algorithm>
#include <cstdio>
+#include <cstring>
+#include <random>
+#include <set>
#include <string>
#include <vector>
-#include <set>
-#include <random>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
return 1;
}
+ gpt_init();
+
if (params.model_draft.empty()) {
- fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
+ LOG_ERR("%s: --model-draft is required\n", __func__);
return 1;
}
std::default_random_engine rng(params.sparams.seed);
std::uniform_real_distribution<> u_dist;
-#ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("speculative", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
ctx_dft = llama_init_dft.context;
    const auto vocab_type_tgt = llama_vocab_type(model_tgt);
- LOG("vocab_type tgt: %d\n", vocab_type_tgt);
+ LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
    const auto vocab_type_dft = llama_vocab_type(model_dft);
- LOG("vocab_type dft: %d\n", vocab_type_dft);
+ LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) {
- fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
- fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
+ LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
+ LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
return 1;
}
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
) {
- fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
+ LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
return 1;
}
: n_vocab_dft - n_vocab_tgt;
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
- fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
- fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+ LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+ LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
- fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
- fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
+ LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
+ LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
}
- fflush(stderr);
-
const int n_input = inp.size();
const auto t_enc_start = ggml_time_us();
active_seqs.insert(s);
const auto & tokens = drafts[s].tokens;
- LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
+ LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
}
int i_dft = 0;
continue;
}
- LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
+ LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
float r = u_dist(rng);
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
break;
}
}
- LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
+ LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
if (r <= p_tgt / p_dft) {
s_keep = s;
accept = true;
token_str = llama_token_to_piece(ctx_tgt, token_id);
gpt_sampler_accept(smpl, token_id, true);
- LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
- LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;
// calculate residual probability
if (!accept) {
// all drafted tokens were rejected
// sample from the target model
- LOG("all drafted tokens were rejected, sampling from residual distribution\n");
+ LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
std::vector<float> probs(dist_tgt.size);
for (size_t i = 0; i < dist_tgt.size; ++i) {
probs[i] = dist_tgt.data[i].p;
// greedy verification
// sample from the target model
- LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+ LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
gpt_sampler_accept(smpl, token_id, true);
- //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
-
token_str = llama_token_to_piece(ctx_tgt, token_id);
for (int s = 0; s < n_seq_dft; ++s) {
}
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
- LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
+ LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
s_keep = s;
accept = true;
++i_dft;
if (params.use_color) {
// Color token according to its origin sequence
- printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+ LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
} else {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
}
- fflush(stdout);
continue;
} else {
- printf("%s", token_str.c_str());
- fflush(stdout);
+ LOG("%s", token_str.c_str());
break;
}
}
}
{
- LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
+ LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
// TODO: simplify
{
- LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
+ LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
llama_kv_cache_seq_keep(ctx_dft, s_keep);
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
- // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
+ // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode(ctx_dft, batch_dft);
++n_past_dft;
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
- LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+ LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
}
// attempt to split the branch if the probability is high enough
for (int f = 1; f < 8; ++f) {
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
- LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
+ LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}
- // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
+ // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
auto t_dec_end = ggml_time_us();
- LOG_TEE("\n\n");
+ LOG("\n\n");
- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
- LOG_TEE("\ndraft:\n\n");
+ LOG_INF("\n");
+ LOG_INF("draft:\n\n");
// TODO: print sampling/grammar timings for all drafts
llama_perf_context_print(ctx_dft);
- LOG_TEE("\ntarget:\n\n");
+ LOG_INF("\n");
+ LOG_INF("target:\n\n");
gpt_perf_print(ctx_tgt, smpl);
gpt_sampler_free(smpl);
llama_backend_free();
- fprintf(stderr, "\n\n");
+ LOG("\n\n");
return 0;
}
#include "common.h"
+//#include "log.h" // TODO: start using log.h
#include "llama.h"
-#include <cmath>
#include <cstdio>
+#include <cstring>
#include <fstream>
#include <string>
#include <vector>
+#include <iostream> // TODO: remove me
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <shellapi.h> // For CommandLineToArgvW
#endif
-static void print_usage_information(const char * argv0, FILE * stream) {
- fprintf(stream, "usage: %s [options]\n\n", argv0);
- fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
- fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
- fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
- fprintf(stream, "to control the behavior of the tokenizer.\n\n");
- fprintf(stream, " The possible options are:\n");
- fprintf(stream, "\n");
- fprintf(stream, " -h, --help print this help and exit\n");
- fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
- fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
- fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
- fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
- fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
- fprintf(stream, " --stdin read prompt from standard input.\n");
- fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
- fprintf(stream, " --no-parse-special do not parse control tokens.\n");
- fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
- fprintf(stream, " --show-count print the total number of tokens.\n");
+static void print_usage_information(const char * argv0) {
+ printf("usage: %s [options]\n\n", argv0);
+ printf("The tokenize program tokenizes a prompt using a given model,\n");
+ printf("and prints the resulting tokens to standard output.\n\n");
+ printf("It needs a model file, a prompt, and optionally other flags\n");
+ printf("to control the behavior of the tokenizer.\n\n");
+ printf(" The possible options are:\n");
+ printf("\n");
+ printf(" -h, --help print this help and exit\n");
+ printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
+ printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
+ printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+ printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+ printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
+ printf(" --stdin read prompt from standard input.\n");
+ printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+ printf(" --no-parse-special do not parse control tokens.\n");
+ printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+ printf(" --show-count print the total number of tokens.\n");
}
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
const int argc = argv.size();
if (argc <= 1) {
- print_usage_information(argv[0].c_str(), stderr);
+ print_usage_information(argv[0].c_str());
return 1;
}
for (; iarg < argc; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-h" || arg == "--help") {
- print_usage_information(argv[0].c_str(), stdout);
+ print_usage_information(argv[0].c_str());
return 0;
}
else if (arg == "--ids") {
// Start actually doing the tokenizing stuff.
//////
-#ifdef LOG_DISABLE_LOGS
- disable_logging = true;
-#endif
-
if (disable_logging) {
llama_log_set(llama_log_callback_null, NULL);
}
};
enum ggml_log_level {
- GGML_LOG_LEVEL_ERROR = 2,
- GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
- GGML_LOG_LEVEL_DEBUG = 5
+ GGML_LOG_LEVEL_NONE = 0,
+ GGML_LOG_LEVEL_INFO = 1,
+ GGML_LOG_LEVEL_WARN = 2,
+ GGML_LOG_LEVEL_ERROR = 3,
+ GGML_LOG_LEVEL_DEBUG = 4,
};
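A small standalone sketch (not part of the patch) of how a log callback can filter on the renumbered levels; note that DEBUG now has the highest numeric value, so a purely numeric threshold has to treat it separately. The local enum below only mirrors the values above so the example is self-contained, and the threshold is hypothetical.

    #include <cstdio>

    // mirrors the renumbered ggml_log_level values for this standalone sketch
    enum log_level_example { LVL_NONE = 0, LVL_INFO = 1, LVL_WARN = 2, LVL_ERROR = 3, LVL_DEBUG = 4 };

    static void log_cb(int level, const char * text) {
        const int min_level = LVL_WARN; // hypothetical verbosity threshold

        if (level == LVL_DEBUG) {
            return; // debug is numerically highest, so it is handled explicitly
        }
        if (level == LVL_NONE || level >= min_level) {
            std::fputs(text, stderr);
        }
    }

    int main() {
        log_cb(LVL_INFO,  "info: filtered out by the threshold\n");
        log_cb(LVL_ERROR, "error: printed\n");
        log_cb(LVL_DEBUG, "debug: suppressed\n");
        return 0;
    }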
enum ggml_tensor_flag {
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#ifdef GGML_METAL_NDEBUG
+#define GGML_METAL_LOG(...)
#define GGML_METAL_LOG_INFO(...)
#define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...)
#else
+#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#endif
#define UNUSED(x) (void)(x)
#ifndef GGML_METAL_NDEBUG
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
if (@available(macOS 10.12, iOS 16.0, *)) {
- GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
+ GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
__func__,
size_aligned / 1024.0 / 1024.0,
device.currentAllocatedSize / 1024.0 / 1024.0,
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
- } else {
- GGML_METAL_LOG_INFO("\n");
}
} else {
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
unsigned percentage = (unsigned) (100 * progress);
while (percentage > *cur_percentage_p) {
*cur_percentage_p = percentage;
- LLAMA_LOG_INFO(".");
+ LLAMA_LOG(".");
if (percentage >= 100) {
- LLAMA_LOG_INFO("\n");
+ LLAMA_LOG("\n");
}
}
return true;
if (len < 128) {
g_state.log_callback(level, buffer, g_state.log_callback_user_data);
} else {
- char* buffer2 = new char[len+1];
- vsnprintf(buffer2, len+1, format, args_copy);
+ char * buffer2 = new char[len + 1];
+ vsnprintf(buffer2, len + 1, format, args_copy);
buffer2[len] = 0;
g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
delete[] buffer2;
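For context, a minimal sketch (not part of the patch) of the pattern the code above follows: format into a small fixed buffer first and, if vsnprintf reports that the message did not fit, retry with an exactly sized heap buffer. The log_fmt name is hypothetical.

    #include <cstdarg>
    #include <cstdio>

    // hypothetical helper demonstrating the fixed-buffer-then-heap formatting pattern
    static void log_fmt(const char * format, ...) {
        va_list args;
        va_start(args, format);

        va_list args_copy;
        va_copy(args_copy, args);

        char buffer[128];
        const int len = vsnprintf(buffer, 128, format, args);
        if (len < 128) {
            fputs(buffer, stderr);
        } else {
            // vsnprintf returned the full length, so allocate exactly len + 1 and format again
            char * buffer2 = new char[len + 1];
            vsnprintf(buffer2, len + 1, format, args_copy);
            fputs(buffer2, stderr);
            delete[] buffer2;
        }

        va_end(args_copy);
        va_end(args);
    }

    int main() {
        log_fmt("a short message: %d\n", 42);
        return 0;
    }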
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-log.cpp)
llama_target_and_test(test-arg-parser.cpp)
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
argv = {"binary_name", "--verbose"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
- assert(params.verbosity == 1);
+ assert(params.verbosity > 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
--- /dev/null
+#include "log.h"
+
+#include <cstdlib>
+#include <thread>
+
+int main() {
+ const int n_thread = 8;
+
+ std::thread threads[n_thread];
+ for (int i = 0; i < n_thread; i++) {
+ threads[i] = std::thread([i]() {
+ const int n_msg = 1000;
+
+ for (int j = 0; j < n_msg; j++) {
+ const int log_type = std::rand() % 4;
+
+ switch (log_type) {
+ case 0: LOG_INF("Thread %d: %d\n", i, j); break;
+ case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
+ case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
+ case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
+ default:
+ break;
+ }
+
+                if (std::rand() % 10 < 5) {
+                    gpt_log_set_timestamps(gpt_log_main(), std::rand() % 2);
+                    gpt_log_set_prefix    (gpt_log_main(), std::rand() % 2);
+ }
+ }
+ });
+ }
+
+ for (int i = 0; i < n_thread; i++) {
+ threads[i].join();
+ }
+
+ return 0;
+}