From: Georgi Gerganov Date: Thu, 24 Apr 2025 13:00:10 +0000 (+0300) Subject: cmake : do not include ./src as public for libllama (#13062) X-Git-Tag: upstream/0.0.5185~5 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=13b4548877326fdabee3e831b8cfd65d9844383c;p=pkg%2Fggml%2Fsources%2Fllama.cpp cmake : do not include ./src as public for libllama (#13062) * cmake : do not include ./src as public for libllama ggml-ci * cmake : rework tests ggml-ci * llguidance : remove unicode include ggml-ci * cmake : make c++17 private ggml-ci --- diff --git a/common/arg.cpp b/common/arg.cpp index 9cbf9857..0657553e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -994,7 +994,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-embedding", "llama-eval-callback", "llama-export-lora", - "llama-gbnf-validator", "llama-gen-docs", "llama-gguf", "llama-gguf-hash", @@ -1014,7 +1013,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-perplexity", "llama-q8dot", "llama-quantize", - "llama-quantize-stats", "llama-qwen2vl-cli", "llama-retrieval", "llama-run", diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 66cfab2c..37476f90 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,11 +21,6 @@ else() add_subdirectory(embedding) add_subdirectory(eval-callback) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(gbnf-validator) - endif() - add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -58,10 +53,6 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(cvector-generator) add_subdirectory(export-lora) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(quantize-stats) - endif() add_subdirectory(llava) if (GGML_RPC) add_subdirectory(rpc) diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt deleted file mode 100644 index d2cb524c..00000000 --- a/examples/gbnf-validator/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gbnf-validator) -add_executable(${TARGET} gbnf-validator.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp deleted file mode 100644 index a610e6a0..00000000 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ /dev/null @@ -1,109 +0,0 @@ -#include "unicode.h" -#include "llama-grammar.h" - -#include -#include -#include -#include -#include -#include - -static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { - const auto cpts = unicode_cpts_from_utf8(input_str); - - auto & stacks_cur = llama_grammar_get_stacks(grammar); - - size_t pos = 0; - for (const auto & cpt : cpts) { - llama_grammar_accept(grammar, cpt); - - if (stacks_cur.empty()) { - error_pos = pos; - error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; - return false; - } - ++pos; - } - - for (const auto & stack : stacks_cur) { - if (stack.empty()) { - return true; - } - } - - error_pos = pos; - error_msg = "Unexpected end of input"; - return false; -} - -static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) 
{ - fprintf(stdout, "Input string is invalid according to the grammar.\n"); - fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos); - fprintf(stdout, "\n"); - fprintf(stdout, "Input string:\n"); - fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str()); - if (error_pos < input_str.size()) { - fprintf(stdout, "\033[1;31m%c", input_str[error_pos]); - if (error_pos+1 < input_str.size()) { - fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str()); - } - fprintf(stdout, "\033[0m\n"); - } -} - -int main(int argc, char** argv) { - if (argc != 3) { - fprintf(stdout, "Usage: %s \n", argv[0]); - return 1; - } - - const std::string grammar_filename = argv[1]; - const std::string input_filename = argv[2]; - - // Read the GBNF grammar file - FILE* grammar_file = fopen(grammar_filename.c_str(), "r"); - if (!grammar_file) { - fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str()); - return 1; - } - - std::string grammar_str; - { - std::ifstream grammar_file(grammar_filename); - GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file"); - std::stringstream buffer; - buffer << grammar_file.rdbuf(); - grammar_str = buffer.str(); - } - - llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0); - if (grammar == nullptr) { - fprintf(stdout, "Failed to initialize llama_grammar\n"); - return 1; - } - // Read the input file - std::string input_str; - { - std::ifstream input_file(input_filename); - GGML_ASSERT(input_file.is_open() && "Failed to open input file"); - std::stringstream buffer; - buffer << input_file.rdbuf(); - input_str = buffer.str(); - } - - // Validate the input string against the grammar - size_t error_pos; - std::string error_msg; - bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg); - - if (is_valid) { - fprintf(stdout, "Input string is valid according to the grammar.\n"); - } else { - print_error_message(input_str, error_pos, error_msg); - } - - // Clean up - llama_grammar_free_impl(grammar); - - return 0; -} diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt deleted file mode 100644 index 9a3a0d3c..00000000 --- a/examples/quantize-stats/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-quantize-stats) -add_executable(${TARGET} quantize-stats.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp deleted file mode 100644 index dd07ab9b..00000000 --- a/examples/quantize-stats/quantize-stats.cpp +++ /dev/null @@ -1,422 +0,0 @@ -#include "ggml.h" -#include "llama.h" -#include "llama-model.h" -#include "common.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -struct quantize_stats_params { - std::string model = DEFAULT_MODEL_PATH; - bool verbose = false; - bool per_layer_stats = false; - bool print_histogram = false; - bool reference = false; - std::vector include_layers; - std::vector exclude_layers; - std::vector include_types; -}; - -constexpr size_t HISTOGRAM_BUCKETS = 150; -constexpr double HISTOGRAM_RANGE = 0.03; - 
-struct error_stats { - size_t num_samples; - double total_error; - double max_error; - uint64_t error_histogram[HISTOGRAM_BUCKETS]; -}; - -static void quantize_stats_print_usage(int /*argc*/, char ** argv) { - quantize_stats_params params; - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -r, --reference\n"); - fprintf(stderr, " use reference implementation (default: false)\n"); - fprintf(stderr, " -v, --verbose\n"); - fprintf(stderr, " verbose output (default: false)\n"); - fprintf(stderr, " -p, --per-layer-stats\n"); - fprintf(stderr, " print stats per layer (default: false)\n"); - fprintf(stderr, " --histogram\n"); - fprintf(stderr, " print error histogram (default: false)\n"); - fprintf(stderr, " -l LAYER, --include-layer LAYER\n"); - fprintf(stderr, " only test layers matching pattern\n"); - fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n"); - fprintf(stderr, " exclude layers matching pattern\n"); - fprintf(stderr, " -t TYPE, --type TYPE\n"); - fprintf(stderr, " only test given type (q4_0, q4_1)\n"); - fprintf(stderr, "\n"); -} - -// Check if a layer is included/excluded by command line -static bool layer_included(const quantize_stats_params & params, const std::string & layer) { - for (const auto& excluded : params.exclude_layers) { - if (std::regex_search(layer, std::regex(excluded))) { - return false; - } - } - for (const auto& included : params.include_layers) { - if (std::regex_search(layer, std::regex(included))) { - return true; - } - } - return params.include_layers.empty(); -} - -// Update error statistics given vectors with the before/after result of quantization -static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { - for (int64_t i = 0; i < nelements; i++) { - double diff = input[i] - output[i]; - stats.total_error += diff * diff; - stats.max_error = fmax(fabs(diff), stats.max_error); - stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++; - } - stats.num_samples += nelements; -} - -static void combine_error_stats(error_stats & into, const error_stats & from) { - into.num_samples += from.num_samples; - into.total_error += from.total_error; - if (from.max_error > into.max_error) into.max_error = from.max_error; - for (size_t i=0; i= sum*quantile) { - return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; - } - } - return INFINITY; -} - -static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { - double rmse = sqrt(stats.total_error / (double) stats.num_samples); - double median = find_quantile(stats, .5); - double pct95 = find_quantile(stats, .95); - printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median); - if (print_histogram) { - printf("Error distribution:\n"); - for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { - double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; - double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; - if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY; - printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]); - } - } -} - -// copied from ggml.h - verify that we can access this as a flat 
array -static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return - tensor->nb[0] == ggml_type_size(tensor->type) && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && - tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && - tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; -} - -static void test_roundtrip_on_chunk( - const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, - float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats -) { - if (layer->type == GGML_TYPE_F16) { - for (int i = 0; i < chunk_size; i++) { - input_scratch[i] = ggml_get_f32_1d(layer, i + offset); - } - } else { - input_scratch = ggml_get_data_f32(layer) + offset; - } - - if (use_reference) { - qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size); - } else { - qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); - } - qfns.to_float(quantized_scratch, output_scratch, chunk_size); - - update_error_stats(chunk_size, input_scratch, output_scratch, stats); -} - - -// Run quantization function for a single layer and update error stats -static void test_roundtrip_on_layer( - std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, - const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, - std::vector & output_scratch, error_stats & total_error, int max_thread = 0 -) { - assert(tensor_is_contiguous(layer)); - error_stats layer_error {}; - uint64_t nelements = ggml_nelements(layer); - - float* input_scratch_ptr = nullptr; - if (layer->type == GGML_TYPE_F16) { - if (input_scratch.size() < nelements) input_scratch.resize(nelements); - input_scratch_ptr = input_scratch.data(); - } - if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements); - if (output_scratch.size() < nelements) output_scratch.resize(nelements); - - if (max_thread < 1) max_thread = std::thread::hardware_concurrency(); - int chunk_size = 32*512; - int num_chunks = (nelements + chunk_size - 1)/chunk_size; - - if (num_chunks < 2 || max_thread < 2) { - test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(), - output_scratch.data(), print_layer_stats ? layer_error : total_error); - } else { - auto & stats = print_layer_stats ? layer_error : total_error; - std::mutex mutex; - uint64_t counter = 0; - auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr, - &quantized_scratch, &output_scratch, chunk_size] () { - error_stats local_stats {}; - while (true) { - std::unique_lock lock(mutex); - uint64_t offset = counter; counter += chunk_size; - if (offset >= nelements) { - combine_error_stats(stats, local_stats); - break; - } - lock.unlock(); - uint64_t chunk = offset + chunk_size < nelements ? 
chunk_size : nelements - offset; - test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset, - quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); - } - }; - int nthread = std::min(num_chunks, max_thread); - std::vector workers(nthread-1); - for (auto& w : workers) w = std::thread(compute); - compute(); - for (auto& w : workers) w.join(); - } - - if (print_layer_stats) { - print_error_stats(name, layer_error, false); - combine_error_stats(total_error, layer_error); - } -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - quantize_stats_params params; - - // read command line - - int max_thread = 0; - bool invalid_param = false; - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "-h" || arg == "--help") { - quantize_stats_print_usage(argc, argv); - exit(0); - } else if (arg == "-r" || arg == "--reference") { - params.reference = true; - } else if (arg == "-v") { - params.verbose = true; - } else if (arg == "-p" || arg == "--per-layer-stats") { - params.per_layer_stats = true; - } else if (arg == "--histogram") { - params.print_histogram = true; - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-l" || arg == "--include-layer") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.include_layers.emplace_back(argv[i]); - } else if (arg == "-L" || arg == "--exclude-layer") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.exclude_layers.emplace_back(argv[i]); - } else if (arg == "-t" || arg == "--type") { - if (++i >= argc) { - invalid_param = true; - break; - } - int j; - for (j = 0; j < GGML_TYPE_COUNT; ++j) { - const auto * name = ggml_type_name((ggml_type) j); - if (name && strcmp(argv[i], name) == 0) break; - } - if (j < GGML_TYPE_COUNT) { - params.include_types.push_back((ggml_type) j); - } else { - fprintf(stderr, "error: %s not in list of types\n", argv[i]); - invalid_param = true; - } - } else if (arg == "-n" || arg == "--num-threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - max_thread = atoi(argv[i]); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - quantize_stats_print_usage(argc, argv); - return 1; - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - quantize_stats_print_usage(argc, argv); - return 1; - } - - print_build_info(); - - // load the model - fprintf(stderr, "Loading model\n"); - - const int64_t t_main_start_us = ggml_time_us(); - llama_model * model; - llama_context * ctx; - - { - auto mparams = llama_model_default_params(); - mparams.use_mlock = false; - - model = llama_model_load_from_file(params.model.c_str(), mparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return 1; - } - - auto cparams = llama_context_default_params(); - cparams.n_ctx = 256; - - ctx = llama_init_from_model(model, cparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); - llama_model_free(model); - return 1; - } - } - - const auto & tensors = llama_internal_get_tensor_map(model); - - // check layer tensors - int included_layers = 0; - int64_t max_nelements = 0; - bool is_f16 = false; - for (const auto & kv_tensor : tensors) { - if (!layer_included(params, kv_tensor.first)) 
{ - continue; - } - if (params.verbose) { - printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second)); - } - if (kv_tensor.second->type == GGML_TYPE_F16) { - is_f16 = true; - } else if (kv_tensor.second->type != GGML_TYPE_F32) { - fprintf(stderr, "%s: error: Quantization should be tested with a float model, " - "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); - llama_free(ctx); - llama_model_free(model); - return 1; - } - included_layers++; - max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second)); - } - - if (is_f16) { - printf("note: source model is f16\n"); - } - printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements); - // allocate scratch space - std::vector input_scratch; - std::vector quantized_scratch; - std::vector output_scratch; - - // loop throught quantization types - for (int i = 0; i < GGML_TYPE_COUNT; i++) { - const ggml_type type = (ggml_type) i; - if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { - continue; - } - const auto * qfns = ggml_get_type_traits(type); - const auto * qfns_cpu = ggml_get_type_traits_cpu(type); - if (qfns_cpu->from_float && qfns->to_float) { - if (params.verbose) { - printf("testing %s ...\n", ggml_type_name(type)); - } - - ggml_quantize_init(type); - - error_stats global_stats {}; - - for (const auto & kv_tensor : tensors) { - if (!layer_included(params, kv_tensor.first)) { - continue; - } - if (params.verbose) { - printf(" %s ...\n", kv_tensor.first.c_str()); - } - std::string layer_name { ggml_type_name(type) }; - layer_name += "::" + kv_tensor.first; - test_roundtrip_on_layer( - layer_name, - params.per_layer_stats, - *qfns, *qfns_cpu, - params.reference, - kv_tensor.second, - input_scratch, - quantized_scratch, - output_scratch, - global_stats, - max_thread - ); - } - - print_error_stats(ggml_type_name(type), global_stats, params.print_histogram); - } - } - - - llama_free(ctx); - llama_model_free(model); - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n"); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0); - } - - return 0; -} diff --git a/grammars/README.md b/grammars/README.md index 935213f5..5aa12acc 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -112,7 +112,7 @@ You can use GBNF grammars: - In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field - In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags -- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings. +- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings. ## JSON Schemas → GBNF diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9f7ab13f..1cd316b0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,8 +32,9 @@ add_library(llama unicode.h ) -target_include_directories(llama PUBLIC . ../include) -target_compile_features (llama PUBLIC cxx_std_17) # don't bump +target_include_directories(llama PRIVATE .) 
+target_include_directories(llama PUBLIC ../include) +target_compile_features (llama PRIVATE cxx_std_17) # don't bump target_link_libraries(llama PUBLIC ggml) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2bb21070..ae682752 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,17 @@ llama_add_compile_flags() +function(llama_build source) + if (DEFINED LLAMA_TEST_NAME) + set(TEST_TARGET ${LLAMA_TEST_NAME}) + else() + get_filename_component(TEST_TARGET ${source} NAME_WE) + endif() + + add_executable(${TEST_TARGET} ${source}) + target_link_libraries(${TEST_TARGET} PRIVATE common) + install(TARGETS ${TEST_TARGET} RUNTIME) +endfunction() + function(llama_test target) include(CMakeParseArguments) set(options) @@ -36,7 +48,7 @@ endfunction() # - LABEL: label for the test (defaults to main) # - ARGS: arguments to pass to the test executable # - WORKING_DIRECTORY -function(llama_target_and_test source) +function(llama_build_and_test source) include(CMakeParseArguments) set(options) set(oneValueArgs NAME LABEL WORKING_DIRECTORY) @@ -58,6 +70,7 @@ function(llama_target_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE common) + add_test( NAME ${TEST_TARGET} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} @@ -68,9 +81,7 @@ function(llama_target_and_test source) endfunction() # build test-tokenizer-0 target once and add many tests -add_executable(test-tokenizer-0 test-tokenizer-0.cpp) -target_link_libraries(test-tokenizer-0 PRIVATE common) -install(TARGETS test-tokenizer-0 RUNTIME) +llama_build(test-tokenizer-0.cpp) llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) @@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) if (LLAMA_LLGUIDANCE) - llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) + llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) endif () if (NOT WIN32) # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API - llama_target_and_test(test-sampling.cpp) - llama_target_and_test(test-grammar-parser.cpp) - llama_target_and_test(test-grammar-integration.cpp) - llama_target_and_test(test-llama-grammar.cpp) - llama_target_and_test(test-chat.cpp) + llama_build_and_test(test-sampling.cpp) + llama_build_and_test(test-grammar-parser.cpp) + llama_build_and_test(test-grammar-integration.cpp) + llama_build_and_test(test-llama-grammar.cpp) + llama_build_and_test(test-chat.cpp) # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) + llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) 
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server) endif() + llama_build(test-quantize-stats.cpp) + llama_build(test-gbnf-validator.cpp) # build test-tokenizer-1-bpe target once and add many tests - add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) - target_link_libraries(test-tokenizer-1-bpe PRIVATE common) - install(TARGETS test-tokenizer-1-bpe RUNTIME) + llama_build(test-tokenizer-1-bpe.cpp) # TODO: disabled due to slowness #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) @@ -120,37 +131,35 @@ if (NOT WIN32) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) # build test-tokenizer-1-spm target once and add many tests - add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp) - target_link_libraries(test-tokenizer-1-spm PRIVATE common) - install(TARGETS test-tokenizer-1-spm RUNTIME) + llama_build(test-tokenizer-1-spm.cpp) llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf) #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) - # llama_target_and_test(test-double-float.cpp) # SLOW + # llama_build_and_test(test-double-float.cpp) # SLOW endif() -llama_target_and_test(test-log.cpp) -llama_target_and_test(test-chat-template.cpp) +llama_build_and_test(test-log.cpp) +llama_build_and_test(test-chat-template.cpp) # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32) - llama_target_and_test(test-arg-parser.cpp) + llama_build_and_test(test-arg-parser.cpp) endif() -# llama_target_and_test(test-opt.cpp) # SLOW -llama_target_and_test(test-gguf.cpp) -llama_target_and_test(test-backend-ops.cpp) +# llama_build_and_test(test-opt.cpp) # SLOW +llama_build_and_test(test-gguf.cpp) +llama_build_and_test(test-backend-ops.cpp) -llama_target_and_test(test-model-load-cancel.cpp LABEL "model") -llama_target_and_test(test-autorelease.cpp LABEL "model") +llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-autorelease.cpp LABEL "model") if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading - llama_target_and_test(test-barrier.cpp) - llama_target_and_test(test-quantize-fns.cpp) - llama_target_and_test(test-quantize-perf.cpp) - llama_target_and_test(test-rope.cpp) + llama_build_and_test(test-barrier.cpp) + llama_build_and_test(test-quantize-fns.cpp) + llama_build_and_test(test-quantize-perf.cpp) + llama_build_and_test(test-rope.cpp) endif() diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a0bf6aff..fa7aed82 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -11,8 +11,9 @@ #include #include "chat.h" -#include "llama-grammar.h" -#include "unicode.h" + +#include "../src/unicode.h" +#include "../src/llama-grammar.h" using json = nlohmann::ordered_json; diff --git a/tests/test-gbnf-validator.cpp b/tests/test-gbnf-validator.cpp new file mode 100644 index 00000000..6547eec3 --- /dev/null +++ b/tests/test-gbnf-validator.cpp @@ -0,0 +1,109 @@ +#include "../src/unicode.h" +#include "../src/llama-grammar.h" + +#include +#include +#include +#include +#include +#include + +static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, 
size_t & error_pos, std::string & error_msg) { + const auto cpts = unicode_cpts_from_utf8(input_str); + + auto & stacks_cur = llama_grammar_get_stacks(grammar); + + size_t pos = 0; + for (const auto & cpt : cpts) { + llama_grammar_accept(grammar, cpt); + + if (stacks_cur.empty()) { + error_pos = pos; + error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; + return false; + } + ++pos; + } + + for (const auto & stack : stacks_cur) { + if (stack.empty()) { + return true; + } + } + + error_pos = pos; + error_msg = "Unexpected end of input"; + return false; +} + +static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) { + fprintf(stdout, "Input string is invalid according to the grammar.\n"); + fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos); + fprintf(stdout, "\n"); + fprintf(stdout, "Input string:\n"); + fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str()); + if (error_pos < input_str.size()) { + fprintf(stdout, "\033[1;31m%c", input_str[error_pos]); + if (error_pos+1 < input_str.size()) { + fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str()); + } + fprintf(stdout, "\033[0m\n"); + } +} + +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stdout, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string grammar_filename = argv[1]; + const std::string input_filename = argv[2]; + + // Read the GBNF grammar file + FILE* grammar_file = fopen(grammar_filename.c_str(), "r"); + if (!grammar_file) { + fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str()); + return 1; + } + + std::string grammar_str; + { + std::ifstream grammar_file(grammar_filename); + GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file"); + std::stringstream buffer; + buffer << grammar_file.rdbuf(); + grammar_str = buffer.str(); + } + + llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0); + if (grammar == nullptr) { + fprintf(stdout, "Failed to initialize llama_grammar\n"); + return 1; + } + // Read the input file + std::string input_str; + { + std::ifstream input_file(input_filename); + GGML_ASSERT(input_file.is_open() && "Failed to open input file"); + std::stringstream buffer; + buffer << input_file.rdbuf(); + input_str = buffer.str(); + } + + // Validate the input string against the grammar + size_t error_pos; + std::string error_msg; + bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg); + + if (is_valid) { + fprintf(stdout, "Input string is valid according to the grammar.\n"); + } else { + print_error_message(input_str, error_pos, error_msg); + } + + // Clean up + llama_grammar_free_impl(grammar); + + return 0; +} diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 89060864..8988c347 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -2,10 +2,11 @@ #undef NDEBUG #endif -#include "unicode.h" -#include "llama-grammar.h" #include "json-schema-to-grammar.h" +#include "../src/unicode.h" +#include "../src/llama-grammar.h" + #include #include #include diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp index 3c19220e..566b039a 100644 --- a/tests/test-grammar-llguidance.cpp +++ b/tests/test-grammar-llguidance.cpp @@ -2,7 +2,6 @@ # undef NDEBUG #endif -#include "unicode.h" #include "sampling.h" #include @@ -84,7 +83,7 @@ static void test(const 
std::string & test_desc, const std::string & grammar_str, fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following " - "command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf " + "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf " "test-grammar-integration.string.txt\n\n"); } else { fprintf(stdout, "✅︎\n"); diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 259172d9..67821a2d 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -3,7 +3,9 @@ #endif #include "llama.h" -#include "llama-grammar.h" + +// TODO: shold not include libllama sources +#include "../src/llama-grammar.h" #include diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 4d78e914..e35134f3 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -4,7 +4,7 @@ #include "json-schema-to-grammar.h" -#include "llama-grammar.h" +#include "../src/llama-grammar.h" #include #include diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index e2129206..cc198f3e 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -3,7 +3,8 @@ #endif #include "llama.h" -#include "llama-grammar.h" + +#include "../src/llama-grammar.h" #include #include diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp new file mode 100644 index 00000000..db010591 --- /dev/null +++ b/tests/test-quantize-stats.cpp @@ -0,0 +1,423 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" + +#include "../src/llama-model.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct quantize_stats_params { + std::string model = DEFAULT_MODEL_PATH; + bool verbose = false; + bool per_layer_stats = false; + bool print_histogram = false; + bool reference = false; + std::vector include_layers; + std::vector exclude_layers; + std::vector include_types; +}; + +constexpr size_t HISTOGRAM_BUCKETS = 150; +constexpr double HISTOGRAM_RANGE = 0.03; + +struct error_stats { + size_t num_samples; + double total_error; + double max_error; + uint64_t error_histogram[HISTOGRAM_BUCKETS]; +}; + +static void quantize_stats_print_usage(int /*argc*/, char ** argv) { + quantize_stats_params params; + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); + fprintf(stderr, " -r, --reference\n"); + fprintf(stderr, " use reference implementation (default: false)\n"); + fprintf(stderr, " -v, --verbose\n"); + fprintf(stderr, " verbose output (default: false)\n"); + fprintf(stderr, " -p, --per-layer-stats\n"); + fprintf(stderr, " print stats per layer (default: false)\n"); + fprintf(stderr, " --histogram\n"); + fprintf(stderr, " print error histogram (default: false)\n"); + fprintf(stderr, " -l LAYER, --include-layer LAYER\n"); + fprintf(stderr, " only test layers matching pattern\n"); + fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n"); + fprintf(stderr, " exclude layers matching pattern\n"); + fprintf(stderr, " -t TYPE, --type TYPE\n"); + fprintf(stderr, " only test given type (q4_0, q4_1)\n"); + 
fprintf(stderr, "\n"); +} + +// Check if a layer is included/excluded by command line +static bool layer_included(const quantize_stats_params & params, const std::string & layer) { + for (const auto& excluded : params.exclude_layers) { + if (std::regex_search(layer, std::regex(excluded))) { + return false; + } + } + for (const auto& included : params.include_layers) { + if (std::regex_search(layer, std::regex(included))) { + return true; + } + } + return params.include_layers.empty(); +} + +// Update error statistics given vectors with the before/after result of quantization +static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) { + for (int64_t i = 0; i < nelements; i++) { + double diff = input[i] - output[i]; + stats.total_error += diff * diff; + stats.max_error = fmax(fabs(diff), stats.max_error); + stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++; + } + stats.num_samples += nelements; +} + +static void combine_error_stats(error_stats & into, const error_stats & from) { + into.num_samples += from.num_samples; + into.total_error += from.total_error; + if (from.max_error > into.max_error) into.max_error = from.max_error; + for (size_t i=0; i= sum*quantile) { + return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + } + } + return INFINITY; +} + +static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { + double rmse = sqrt(stats.total_error / (double) stats.num_samples); + double median = find_quantile(stats, .5); + double pct95 = find_quantile(stats, .95); + printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median); + if (print_histogram) { + printf("Error distribution:\n"); + for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { + double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS; + if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY; + printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]); + } + } +} + +// copied from ggml.h - verify that we can access this as a flat array +static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && + tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && + tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; +} + +static void test_roundtrip_on_chunk( + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, + float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats +) { + if (layer->type == GGML_TYPE_F16) { + for (int i = 0; i < chunk_size; i++) { + input_scratch[i] = ggml_get_f32_1d(layer, i + offset); + } + } else { + input_scratch = ggml_get_data_f32(layer) + offset; + } + + if (use_reference) { + qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size); + } else { + qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); + } + qfns.to_float(quantized_scratch, output_scratch, chunk_size); + + update_error_stats(chunk_size, input_scratch, output_scratch, stats); +} + + +// Run quantization function for a single layer and update error 
stats +static void test_roundtrip_on_layer( + std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, + const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, + std::vector & output_scratch, error_stats & total_error, int max_thread = 0 +) { + assert(tensor_is_contiguous(layer)); + error_stats layer_error {}; + uint64_t nelements = ggml_nelements(layer); + + float* input_scratch_ptr = nullptr; + if (layer->type == GGML_TYPE_F16) { + if (input_scratch.size() < nelements) input_scratch.resize(nelements); + input_scratch_ptr = input_scratch.data(); + } + if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements); + if (output_scratch.size() < nelements) output_scratch.resize(nelements); + + if (max_thread < 1) max_thread = std::thread::hardware_concurrency(); + int chunk_size = 32*512; + int num_chunks = (nelements + chunk_size - 1)/chunk_size; + + if (num_chunks < 2 || max_thread < 2) { + test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(), + output_scratch.data(), print_layer_stats ? layer_error : total_error); + } else { + auto & stats = print_layer_stats ? layer_error : total_error; + std::mutex mutex; + uint64_t counter = 0; + auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr, + &quantized_scratch, &output_scratch, chunk_size] () { + error_stats local_stats {}; + while (true) { + std::unique_lock lock(mutex); + uint64_t offset = counter; counter += chunk_size; + if (offset >= nelements) { + combine_error_stats(stats, local_stats); + break; + } + lock.unlock(); + uint64_t chunk = offset + chunk_size < nelements ? 
chunk_size : nelements - offset; + test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset, + quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); + } + }; + int nthread = std::min(num_chunks, max_thread); + std::vector workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); + } + + if (print_layer_stats) { + print_error_stats(name, layer_error, false); + combine_error_stats(total_error, layer_error); + } +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + quantize_stats_params params; + + // read command line + + int max_thread = 0; + bool invalid_param = false; + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-h" || arg == "--help") { + quantize_stats_print_usage(argc, argv); + exit(0); + } else if (arg == "-r" || arg == "--reference") { + params.reference = true; + } else if (arg == "-v") { + params.verbose = true; + } else if (arg == "-p" || arg == "--per-layer-stats") { + params.per_layer_stats = true; + } else if (arg == "--histogram") { + params.print_histogram = true; + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-l" || arg == "--include-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.include_layers.emplace_back(argv[i]); + } else if (arg == "-L" || arg == "--exclude-layer") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.exclude_layers.emplace_back(argv[i]); + } else if (arg == "-t" || arg == "--type") { + if (++i >= argc) { + invalid_param = true; + break; + } + int j; + for (j = 0; j < GGML_TYPE_COUNT; ++j) { + const auto * name = ggml_type_name((ggml_type) j); + if (name && strcmp(argv[i], name) == 0) break; + } + if (j < GGML_TYPE_COUNT) { + params.include_types.push_back((ggml_type) j); + } else { + fprintf(stderr, "error: %s not in list of types\n", argv[i]); + invalid_param = true; + } + } else if (arg == "-n" || arg == "--num-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + max_thread = atoi(argv[i]); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + quantize_stats_print_usage(argc, argv); + return 1; + } + + print_build_info(); + + // load the model + fprintf(stderr, "Loading model\n"); + + const int64_t t_main_start_us = ggml_time_us(); + llama_model * model; + llama_context * ctx; + + { + auto mparams = llama_model_default_params(); + mparams.use_mlock = false; + + model = llama_model_load_from_file(params.model.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + cparams.n_ctx = 256; + + ctx = llama_init_from_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_model_free(model); + return 1; + } + } + + const auto & tensors = llama_internal_get_tensor_map(model); + + // check layer tensors + int included_layers = 0; + int64_t max_nelements = 0; + bool is_f16 = false; + for (const auto & kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) 
{ + continue; + } + if (params.verbose) { + printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second)); + } + if (kv_tensor.second->type == GGML_TYPE_F16) { + is_f16 = true; + } else if (kv_tensor.second->type != GGML_TYPE_F32) { + fprintf(stderr, "%s: error: Quantization should be tested with a float model, " + "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); + llama_free(ctx); + llama_model_free(model); + return 1; + } + included_layers++; + max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second)); + } + + if (is_f16) { + printf("note: source model is f16\n"); + } + printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements); + // allocate scratch space + std::vector input_scratch; + std::vector quantized_scratch; + std::vector output_scratch; + + // loop throught quantization types + for (int i = 0; i < GGML_TYPE_COUNT; i++) { + const ggml_type type = (ggml_type) i; + if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { + continue; + } + const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); + if (qfns_cpu->from_float && qfns->to_float) { + if (params.verbose) { + printf("testing %s ...\n", ggml_type_name(type)); + } + + ggml_quantize_init(type); + + error_stats global_stats {}; + + for (const auto & kv_tensor : tensors) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (params.verbose) { + printf(" %s ...\n", kv_tensor.first.c_str()); + } + std::string layer_name { ggml_type_name(type) }; + layer_name += "::" + kv_tensor.first; + test_roundtrip_on_layer( + layer_name, + params.per_layer_stats, + *qfns, *qfns_cpu, + params.reference, + kv_tensor.second, + input_scratch, + quantized_scratch, + output_scratch, + global_stats, + max_thread + ); + } + + print_error_stats(ggml_type_name(type), global_stats, params.print_histogram); + } + } + + + llama_free(ctx); + llama_model_free(model); + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n"); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0); + } + + return 0; +} diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 55425d88..b183da47 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -1,8 +1,9 @@ #include "llama.h" #include "common.h" -#include "unicode.h" #include "console.h" +#include "../src/unicode.h" + #include #include #include diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index 9e7b77f3..ba6e94ba 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -1,8 +1,9 @@ #include "llama.h" #include "common.h" -#include "unicode.h" #include "console.h" +#include "../src/unicode.h" + #include #include #include
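
Note (illustrative, not part of the patch): the practical effect of making ./src a PRIVATE include directory and cxx_std_17 a PRIVATE compile feature is that consumers of the llama target only inherit ../include, must pick their own C++ standard, and can no longer include internal headers implicitly — which is why the tests above now use paths like "../src/llama-grammar.h". A minimal CMake sketch of a hypothetical in-tree consumer (the target name my-llama-tool and its source file are made up for illustration):

    # hypothetical consumer target built inside the llama.cpp tree
    add_executable(my-llama-tool my-llama-tool.cpp)

    # linking against llama now propagates only ../include (llama.h etc.),
    # not the library's internal ./src directory
    target_link_libraries(my-llama-tool PRIVATE llama)

    # cxx_std_17 is PRIVATE to llama after this commit, so the consumer
    # must request its own language standard
    target_compile_features(my-llama-tool PRIVATE cxx_std_17)

    # a test or tool that still needs internal symbols has to reach the
    # headers explicitly, e.g. #include "../src/llama-grammar.h"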