"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
- "llama-gbnf-validator",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
"llama-perplexity",
"llama-q8dot",
"llama-quantize",
- "llama-quantize-stats",
"llama-qwen2vl-cli",
"llama-retrieval",
"llama-run",
add_subdirectory(embedding)
add_subdirectory(eval-callback)
- if (NOT WIN32)
- # disabled on Windows because it uses internal functions not exported with LLAMA_API
- add_subdirectory(gbnf-validator)
- endif()
-
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
- if (NOT WIN32)
- # disabled on Windows because it uses internal functions not exported with LLAMA_API
- add_subdirectory(quantize-stats)
- endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
+++ /dev/null
-set(TARGET llama-gbnf-validator)
-add_executable(${TARGET} gbnf-validator.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+++ /dev/null
-#include "unicode.h"
-#include "llama-grammar.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <sstream>
-#include <fstream>
-#include <string>
-#include <vector>
-
-static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
- const auto cpts = unicode_cpts_from_utf8(input_str);
-
- auto & stacks_cur = llama_grammar_get_stacks(grammar);
-
- size_t pos = 0;
- for (const auto & cpt : cpts) {
- llama_grammar_accept(grammar, cpt);
-
- if (stacks_cur.empty()) {
- error_pos = pos;
- error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
- return false;
- }
- ++pos;
- }
-
- for (const auto & stack : stacks_cur) {
- if (stack.empty()) {
- return true;
- }
- }
-
- error_pos = pos;
- error_msg = "Unexpected end of input";
- return false;
-}
-
-static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
- fprintf(stdout, "Input string is invalid according to the grammar.\n");
- fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
- fprintf(stdout, "\n");
- fprintf(stdout, "Input string:\n");
- fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
- if (error_pos < input_str.size()) {
- fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
- if (error_pos+1 < input_str.size()) {
- fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
- }
- fprintf(stdout, "\033[0m\n");
- }
-}
-
-int main(int argc, char** argv) {
- if (argc != 3) {
- fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
- return 1;
- }
-
- const std::string grammar_filename = argv[1];
- const std::string input_filename = argv[2];
-
- // Read the GBNF grammar file
- FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
- if (!grammar_file) {
- fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
- return 1;
- }
-
- std::string grammar_str;
- {
- std::ifstream grammar_file(grammar_filename);
- GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
- std::stringstream buffer;
- buffer << grammar_file.rdbuf();
- grammar_str = buffer.str();
- }
-
- llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
- if (grammar == nullptr) {
- fprintf(stdout, "Failed to initialize llama_grammar\n");
- return 1;
- }
- // Read the input file
- std::string input_str;
- {
- std::ifstream input_file(input_filename);
- GGML_ASSERT(input_file.is_open() && "Failed to open input file");
- std::stringstream buffer;
- buffer << input_file.rdbuf();
- input_str = buffer.str();
- }
-
- // Validate the input string against the grammar
- size_t error_pos;
- std::string error_msg;
- bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
-
- if (is_valid) {
- fprintf(stdout, "Input string is valid according to the grammar.\n");
- } else {
- print_error_message(input_str, error_pos, error_msg);
- }
-
- // Clean up
- llama_grammar_free_impl(grammar);
-
- return 0;
-}
+++ /dev/null
-set(TARGET llama-quantize-stats)
-add_executable(${TARGET} quantize-stats.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+++ /dev/null
-#include "ggml.h"
-#include "llama.h"
-#include "llama-model.h"
-#include "common.h"
-
-#include <algorithm>
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <numeric>
-#include <regex>
-#include <string>
-#include <vector>
-#include <thread>
-#include <mutex>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct quantize_stats_params {
- std::string model = DEFAULT_MODEL_PATH;
- bool verbose = false;
- bool per_layer_stats = false;
- bool print_histogram = false;
- bool reference = false;
- std::vector<std::string> include_layers;
- std::vector<std::string> exclude_layers;
- std::vector<enum ggml_type> include_types;
-};
-
-constexpr size_t HISTOGRAM_BUCKETS = 150;
-constexpr double HISTOGRAM_RANGE = 0.03;
-
-struct error_stats {
- size_t num_samples;
- double total_error;
- double max_error;
- uint64_t error_histogram[HISTOGRAM_BUCKETS];
-};
-
-static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
- quantize_stats_params params;
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
- fprintf(stderr, "\n");
- fprintf(stderr, "options:\n");
- fprintf(stderr, " -h, --help show this help message and exit\n");
- fprintf(stderr, " -m FNAME, --model FNAME\n");
- fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
- fprintf(stderr, " -r, --reference\n");
- fprintf(stderr, " use reference implementation (default: false)\n");
- fprintf(stderr, " -v, --verbose\n");
- fprintf(stderr, " verbose output (default: false)\n");
- fprintf(stderr, " -p, --per-layer-stats\n");
- fprintf(stderr, " print stats per layer (default: false)\n");
- fprintf(stderr, " --histogram\n");
- fprintf(stderr, " print error histogram (default: false)\n");
- fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
- fprintf(stderr, " only test layers matching pattern\n");
- fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
- fprintf(stderr, " exclude layers matching pattern\n");
- fprintf(stderr, " -t TYPE, --type TYPE\n");
- fprintf(stderr, " only test given type (q4_0, q4_1)\n");
- fprintf(stderr, "\n");
-}
-
-// Check if a layer is included/excluded by command line
-static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
- for (const auto& excluded : params.exclude_layers) {
- if (std::regex_search(layer, std::regex(excluded))) {
- return false;
- }
- }
- for (const auto& included : params.include_layers) {
- if (std::regex_search(layer, std::regex(included))) {
- return true;
- }
- }
- return params.include_layers.empty();
-}
-
-// Update error statistics given vectors with the before/after result of quantization
-static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
- for (int64_t i = 0; i < nelements; i++) {
- double diff = input[i] - output[i];
- stats.total_error += diff * diff;
- stats.max_error = fmax(fabs(diff), stats.max_error);
- stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
- }
- stats.num_samples += nelements;
-}
-
-static void combine_error_stats(error_stats & into, const error_stats & from) {
- into.num_samples += from.num_samples;
- into.total_error += from.total_error;
- if (from.max_error > into.max_error) into.max_error = from.max_error;
- for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
-}
-
-static double find_quantile(const error_stats & stats, double quantile) {
- double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
-
- double accum = 0;
- for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
- accum += stats.error_histogram[i];
- if (accum >= sum*quantile) {
- return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
- }
- }
- return INFINITY;
-}
-
-static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
- double rmse = sqrt(stats.total_error / (double) stats.num_samples);
- double median = find_quantile(stats, .5);
- double pct95 = find_quantile(stats, .95);
- printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
- if (print_histogram) {
- printf("Error distribution:\n");
- for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
- double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
- double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
- if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
- printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
- }
- }
-}
-
-// copied from ggml.h - verify that we can access this as a flat array
-static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
- static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
- return
- tensor->nb[0] == ggml_type_size(tensor->type) &&
- tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
- tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
- tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-static void test_roundtrip_on_chunk(
- const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
- float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
-) {
- if (layer->type == GGML_TYPE_F16) {
- for (int i = 0; i < chunk_size; i++) {
- input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
- }
- } else {
- input_scratch = ggml_get_data_f32(layer) + offset;
- }
-
- if (use_reference) {
- qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
- } else {
- qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
- }
- qfns.to_float(quantized_scratch, output_scratch, chunk_size);
-
- update_error_stats(chunk_size, input_scratch, output_scratch, stats);
-}
-
-
-// Run quantization function for a single layer and update error stats
-static void test_roundtrip_on_layer(
- std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
- const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
- std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
-) {
- assert(tensor_is_contiguous(layer));
- error_stats layer_error {};
- uint64_t nelements = ggml_nelements(layer);
-
- float* input_scratch_ptr = nullptr;
- if (layer->type == GGML_TYPE_F16) {
- if (input_scratch.size() < nelements) input_scratch.resize(nelements);
- input_scratch_ptr = input_scratch.data();
- }
- if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
- if (output_scratch.size() < nelements) output_scratch.resize(nelements);
-
- if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
- int chunk_size = 32*512;
- int num_chunks = (nelements + chunk_size - 1)/chunk_size;
-
- if (num_chunks < 2 || max_thread < 2) {
- test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
- output_scratch.data(), print_layer_stats ? layer_error : total_error);
- } else {
- auto & stats = print_layer_stats ? layer_error : total_error;
- std::mutex mutex;
- uint64_t counter = 0;
- auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
- &quantized_scratch, &output_scratch, chunk_size] () {
- error_stats local_stats {};
- while (true) {
- std::unique_lock<std::mutex> lock(mutex);
- uint64_t offset = counter; counter += chunk_size;
- if (offset >= nelements) {
- combine_error_stats(stats, local_stats);
- break;
- }
- lock.unlock();
- uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
- test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
- quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
- }
- };
- int nthread = std::min(num_chunks, max_thread);
- std::vector<std::thread> workers(nthread-1);
- for (auto& w : workers) w = std::thread(compute);
- compute();
- for (auto& w : workers) w.join();
- }
-
- if (print_layer_stats) {
- print_error_stats(name, layer_error, false);
- combine_error_stats(total_error, layer_error);
- }
-}
-
-int main(int argc, char ** argv) {
- ggml_time_init();
-
- quantize_stats_params params;
-
- // read command line
-
- int max_thread = 0;
- bool invalid_param = false;
- std::string arg;
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
-
- if (arg == "-h" || arg == "--help") {
- quantize_stats_print_usage(argc, argv);
- exit(0);
- } else if (arg == "-r" || arg == "--reference") {
- params.reference = true;
- } else if (arg == "-v") {
- params.verbose = true;
- } else if (arg == "-p" || arg == "--per-layer-stats") {
- params.per_layer_stats = true;
- } else if (arg == "--histogram") {
- params.print_histogram = true;
- } else if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.model = argv[i];
- } else if (arg == "-l" || arg == "--include-layer") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.include_layers.emplace_back(argv[i]);
- } else if (arg == "-L" || arg == "--exclude-layer") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- params.exclude_layers.emplace_back(argv[i]);
- } else if (arg == "-t" || arg == "--type") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- int j;
- for (j = 0; j < GGML_TYPE_COUNT; ++j) {
- const auto * name = ggml_type_name((ggml_type) j);
- if (name && strcmp(argv[i], name) == 0) break;
- }
- if (j < GGML_TYPE_COUNT) {
- params.include_types.push_back((ggml_type) j);
- } else {
- fprintf(stderr, "error: %s not in list of types\n", argv[i]);
- invalid_param = true;
- }
- } else if (arg == "-n" || arg == "--num-threads") {
- if (++i >= argc) {
- invalid_param = true;
- break;
- }
- max_thread = atoi(argv[i]);
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- quantize_stats_print_usage(argc, argv);
- return 1;
- }
- }
- if (invalid_param) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- quantize_stats_print_usage(argc, argv);
- return 1;
- }
-
- print_build_info();
-
- // load the model
- fprintf(stderr, "Loading model\n");
-
- const int64_t t_main_start_us = ggml_time_us();
- llama_model * model;
- llama_context * ctx;
-
- {
- auto mparams = llama_model_default_params();
- mparams.use_mlock = false;
-
- model = llama_model_load_from_file(params.model.c_str(), mparams);
-
- if (model == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return 1;
- }
-
- auto cparams = llama_context_default_params();
- cparams.n_ctx = 256;
-
- ctx = llama_init_from_model(model, cparams);
-
- if (ctx == NULL) {
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_model_free(model);
- return 1;
- }
- }
-
- const auto & tensors = llama_internal_get_tensor_map(model);
-
- // check layer tensors
- int included_layers = 0;
- int64_t max_nelements = 0;
- bool is_f16 = false;
- for (const auto & kv_tensor : tensors) {
- if (!layer_included(params, kv_tensor.first)) {
- continue;
- }
- if (params.verbose) {
- printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
- }
- if (kv_tensor.second->type == GGML_TYPE_F16) {
- is_f16 = true;
- } else if (kv_tensor.second->type != GGML_TYPE_F32) {
- fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
- "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
- llama_free(ctx);
- llama_model_free(model);
- return 1;
- }
- included_layers++;
- max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
- }
-
- if (is_f16) {
- printf("note: source model is f16\n");
- }
- printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
- // allocate scratch space
- std::vector<float> input_scratch;
- std::vector<char> quantized_scratch;
- std::vector<float> output_scratch;
-
- // loop throught quantization types
- for (int i = 0; i < GGML_TYPE_COUNT; i++) {
- const ggml_type type = (ggml_type) i;
- if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
- continue;
- }
- const auto * qfns = ggml_get_type_traits(type);
- const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
- if (qfns_cpu->from_float && qfns->to_float) {
- if (params.verbose) {
- printf("testing %s ...\n", ggml_type_name(type));
- }
-
- ggml_quantize_init(type);
-
- error_stats global_stats {};
-
- for (const auto & kv_tensor : tensors) {
- if (!layer_included(params, kv_tensor.first)) {
- continue;
- }
- if (params.verbose) {
- printf(" %s ...\n", kv_tensor.first.c_str());
- }
- std::string layer_name { ggml_type_name(type) };
- layer_name += "::" + kv_tensor.first;
- test_roundtrip_on_layer(
- layer_name,
- params.per_layer_stats,
- *qfns, *qfns_cpu,
- params.reference,
- kv_tensor.second,
- input_scratch,
- quantized_scratch,
- output_scratch,
- global_stats,
- max_thread
- );
- }
-
- print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
- }
- }
-
-
- llama_free(ctx);
- llama_model_free(model);
- // report timing
- {
- const int64_t t_main_end_us = ggml_time_us();
-
- printf("\n");
- printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
- }
-
- return 0;
-}
- In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field
- In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags
-- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings.
+- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings.
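+
+  For example, a hypothetical invocation after building the tests might be `./test-gbnf-validator my-grammar.gbnf sample.txt` (placeholder file names), which reports whether `sample.txt` is accepted by the grammar and, if not, at which position matching fails.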
## JSON Schemas → GBNF
unicode.h
)
-target_include_directories(llama PUBLIC . ../include)
-target_compile_features (llama PUBLIC cxx_std_17) # don't bump
+target_include_directories(llama PRIVATE .)
+target_include_directories(llama PUBLIC ../include)
+target_compile_features (llama PRIVATE cxx_std_17) # don't bump
target_link_libraries(llama PUBLIC ggml)
llama_add_compile_flags()
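+# Build a single source file into an executable named after it (or after
+# LLAMA_TEST_NAME if set), link it against `common`, and install it; unlike
+# llama_build_and_test below, no CTest entry is added.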
+function(llama_build source)
+ if (DEFINED LLAMA_TEST_NAME)
+ set(TEST_TARGET ${LLAMA_TEST_NAME})
+ else()
+ get_filename_component(TEST_TARGET ${source} NAME_WE)
+ endif()
+
+ add_executable(${TEST_TARGET} ${source})
+ target_link_libraries(${TEST_TARGET} PRIVATE common)
+ install(TARGETS ${TEST_TARGET} RUNTIME)
+endfunction()
+
function(llama_test target)
include(CMakeParseArguments)
set(options)
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY
-function(llama_target_and_test source)
+function(llama_build_and_test source)
include(CMakeParseArguments)
set(options)
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
add_executable(${TEST_TARGET} ${source} get-model.cpp)
install(TARGETS ${TEST_TARGET} RUNTIME)
target_link_libraries(${TEST_TARGET} PRIVATE common)
+
add_test(
NAME ${TEST_TARGET}
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
endfunction()
# build test-tokenizer-0 target once and add many tests
-add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
-target_link_libraries(test-tokenizer-0 PRIVATE common)
-install(TARGETS test-tokenizer-0 RUNTIME)
+llama_build(test-tokenizer-0.cpp)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
if (LLAMA_LLGUIDANCE)
- llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+ llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
endif ()
if (NOT WIN32)
# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
- llama_target_and_test(test-sampling.cpp)
- llama_target_and_test(test-grammar-parser.cpp)
- llama_target_and_test(test-grammar-integration.cpp)
- llama_target_and_test(test-llama-grammar.cpp)
- llama_target_and_test(test-chat.cpp)
+ llama_build_and_test(test-sampling.cpp)
+ llama_build_and_test(test-grammar-parser.cpp)
+ llama_build_and_test(test-grammar-integration.cpp)
+ llama_build_and_test(test-llama-grammar.cpp)
+ llama_build_and_test(test-chat.cpp)
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
- llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+ llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
endif()
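+
+    # built as plain executables (no CTest entry); run manually when needed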
+ llama_build(test-quantize-stats.cpp)
+ llama_build(test-gbnf-validator.cpp)
# build test-tokenizer-1-bpe target once and add many tests
- add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
- target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
- install(TARGETS test-tokenizer-1-bpe RUNTIME)
+ llama_build(test-tokenizer-1-bpe.cpp)
# TODO: disabled due to slowness
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# build test-tokenizer-1-spm target once and add many tests
- add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
- target_link_libraries(test-tokenizer-1-spm PRIVATE common)
- install(TARGETS test-tokenizer-1-spm RUNTIME)
+ llama_build(test-tokenizer-1-spm.cpp)
llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
- # llama_target_and_test(test-double-float.cpp) # SLOW
+ # llama_build_and_test(test-double-float.cpp) # SLOW
endif()
-llama_target_and_test(test-log.cpp)
-llama_target_and_test(test-chat-template.cpp)
+llama_build_and_test(test-log.cpp)
+llama_build_and_test(test-chat-template.cpp)
# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
- llama_target_and_test(test-arg-parser.cpp)
+ llama_build_and_test(test-arg-parser.cpp)
endif()
-# llama_target_and_test(test-opt.cpp) # SLOW
-llama_target_and_test(test-gguf.cpp)
-llama_target_and_test(test-backend-ops.cpp)
+# llama_build_and_test(test-opt.cpp) # SLOW
+llama_build_and_test(test-gguf.cpp)
+llama_build_and_test(test-backend-ops.cpp)
-llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
-llama_target_and_test(test-autorelease.cpp LABEL "model")
+llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_build_and_test(test-autorelease.cpp LABEL "model")
if (NOT GGML_BACKEND_DL)
# these tests use the backends directly and cannot be built with dynamic loading
- llama_target_and_test(test-barrier.cpp)
- llama_target_and_test(test-quantize-fns.cpp)
- llama_target_and_test(test-quantize-perf.cpp)
- llama_target_and_test(test-rope.cpp)
+ llama_build_and_test(test-barrier.cpp)
+ llama_build_and_test(test-quantize-fns.cpp)
+ llama_build_and_test(test-quantize-perf.cpp)
+ llama_build_and_test(test-rope.cpp)
endif()
#include <string>
#include "chat.h"
-#include "llama-grammar.h"
-#include "unicode.h"
+
+#include "../src/unicode.h"
+#include "../src/llama-grammar.h"
using json = nlohmann::ordered_json;
--- /dev/null
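+// test-gbnf-validator: standalone checker that reads a GBNF grammar file and an
+// input file, then reports whether the input is accepted by the grammar and, if
+// not, at which position matching fails.
+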
+#include "../src/unicode.h"
+#include "../src/llama-grammar.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <sstream>
+#include <fstream>
+#include <string>
+#include <vector>
+
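+// Feed the input, code point by code point, into the grammar's parse stacks; the
+// input is valid only if at least one stack is empty (a complete parse) at the end.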
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+ const auto cpts = unicode_cpts_from_utf8(input_str);
+
+ auto & stacks_cur = llama_grammar_get_stacks(grammar);
+
+ size_t pos = 0;
+ for (const auto & cpt : cpts) {
+ llama_grammar_accept(grammar, cpt);
+
+ if (stacks_cur.empty()) {
+ error_pos = pos;
+ error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
+ return false;
+ }
+ ++pos;
+ }
+
+ for (const auto & stack : stacks_cur) {
+ if (stack.empty()) {
+ return true;
+ }
+ }
+
+ error_pos = pos;
+ error_msg = "Unexpected end of input";
+ return false;
+}
+
+static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
+ fprintf(stdout, "Input string is invalid according to the grammar.\n");
+ fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
+ fprintf(stdout, "\n");
+ fprintf(stdout, "Input string:\n");
+ fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
+ if (error_pos < input_str.size()) {
+ fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
+ if (error_pos+1 < input_str.size()) {
+ fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
+ }
+ fprintf(stdout, "\033[0m\n");
+ }
+}
+
+int main(int argc, char** argv) {
+ if (argc != 3) {
+ fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
+ return 1;
+ }
+
+ const std::string grammar_filename = argv[1];
+ const std::string input_filename = argv[2];
+
+ // Read the GBNF grammar file
+ FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
+ if (!grammar_file) {
+ fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
+ return 1;
+    }
+    fclose(grammar_file); // close the probe handle; the file is re-read below via std::ifstream
+
+ std::string grammar_str;
+ {
+ std::ifstream grammar_file(grammar_filename);
+ GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
+ std::stringstream buffer;
+ buffer << grammar_file.rdbuf();
+ grammar_str = buffer.str();
+ }
+
+ llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
+ if (grammar == nullptr) {
+ fprintf(stdout, "Failed to initialize llama_grammar\n");
+ return 1;
+ }
+ // Read the input file
+ std::string input_str;
+ {
+ std::ifstream input_file(input_filename);
+ GGML_ASSERT(input_file.is_open() && "Failed to open input file");
+ std::stringstream buffer;
+ buffer << input_file.rdbuf();
+ input_str = buffer.str();
+ }
+
+ // Validate the input string against the grammar
+ size_t error_pos;
+ std::string error_msg;
+ bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
+
+ if (is_valid) {
+ fprintf(stdout, "Input string is valid according to the grammar.\n");
+ } else {
+ print_error_message(input_str, error_pos, error_msg);
+ }
+
+ // Clean up
+ llama_grammar_free_impl(grammar);
+
+ return 0;
+}
#undef NDEBUG
#endif
-#include "unicode.h"
-#include "llama-grammar.h"
#include "json-schema-to-grammar.h"
+#include "../src/unicode.h"
+#include "../src/llama-grammar.h"
+
#include <cassert>
#include <string>
#include <vector>
# undef NDEBUG
#endif
-#include "unicode.h"
#include "sampling.h"
#include <cassert>
fprintf(stderr,
"\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following "
- "command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf "
+ "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf "
"test-grammar-integration.string.txt\n\n");
} else {
fprintf(stdout, "✅︎\n");
#endif
#include "llama.h"
-#include "llama-grammar.h"
+
+// TODO: should not include libllama sources
+#include "../src/llama-grammar.h"
#include <cassert>
#include "json-schema-to-grammar.h"
-#include "llama-grammar.h"
+#include "../src/llama-grammar.h"
#include <cassert>
#include <fstream>
#endif
#include "llama.h"
-#include "llama-grammar.h"
+
+#include "../src/llama-grammar.h"
#include <cassert>
#include <stdexcept>
--- /dev/null
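+// test-quantize-stats: measures the round-trip error (rmse, max error, quantile
+// bounds and optional histogram) introduced by each quantization type on the
+// tensors of an f16/f32 model, optionally reported per layer.
+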
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+
+#include "../src/llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <numeric>
+#include <regex>
+#include <string>
+#include <vector>
+#include <thread>
+#include <mutex>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+struct quantize_stats_params {
+ std::string model = DEFAULT_MODEL_PATH;
+ bool verbose = false;
+ bool per_layer_stats = false;
+ bool print_histogram = false;
+ bool reference = false;
+ std::vector<std::string> include_layers;
+ std::vector<std::string> exclude_layers;
+ std::vector<enum ggml_type> include_types;
+};
+
+constexpr size_t HISTOGRAM_BUCKETS = 150;
+constexpr double HISTOGRAM_RANGE = 0.03;
+
+struct error_stats {
+ size_t num_samples;
+ double total_error;
+ double max_error;
+ uint64_t error_histogram[HISTOGRAM_BUCKETS];
+};
+
+static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+ quantize_stats_params params;
+ fprintf(stderr, "usage: %s [options]\n", argv[0]);
+ fprintf(stderr, "\n");
+ fprintf(stderr, "options:\n");
+ fprintf(stderr, " -h, --help show this help message and exit\n");
+ fprintf(stderr, " -m FNAME, --model FNAME\n");
+ fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
+ fprintf(stderr, " -r, --reference\n");
+ fprintf(stderr, " use reference implementation (default: false)\n");
+ fprintf(stderr, " -v, --verbose\n");
+ fprintf(stderr, " verbose output (default: false)\n");
+ fprintf(stderr, " -p, --per-layer-stats\n");
+ fprintf(stderr, " print stats per layer (default: false)\n");
+ fprintf(stderr, " --histogram\n");
+ fprintf(stderr, " print error histogram (default: false)\n");
+ fprintf(stderr, " -l LAYER, --include-layer LAYER\n");
+ fprintf(stderr, " only test layers matching pattern\n");
+ fprintf(stderr, " -L LAYER, --exclude-layer LAYER\n");
+ fprintf(stderr, " exclude layers matching pattern\n");
+ fprintf(stderr, " -t TYPE, --type TYPE\n");
+ fprintf(stderr, " only test given type (q4_0, q4_1)\n");
+ fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line
+static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+ for (const auto& excluded : params.exclude_layers) {
+ if (std::regex_search(layer, std::regex(excluded))) {
+ return false;
+ }
+ }
+ for (const auto& included : params.include_layers) {
+ if (std::regex_search(layer, std::regex(included))) {
+ return true;
+ }
+ }
+ return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+ for (int64_t i = 0; i < nelements; i++) {
+ double diff = input[i] - output[i];
+ stats.total_error += diff * diff;
+ stats.max_error = fmax(fabs(diff), stats.max_error);
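+        // map |diff| to a histogram bucket over [0, HISTOGRAM_RANGE), clamping overflow into the last bucket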
+ stats.error_histogram[std::max(std::min((size_t) floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), HISTOGRAM_BUCKETS-1), (size_t) 0)]++;
+ }
+ stats.num_samples += nelements;
+}
+
+static void combine_error_stats(error_stats & into, const error_stats & from) {
+ into.num_samples += from.num_samples;
+ into.total_error += from.total_error;
+ if (from.max_error > into.max_error) into.max_error = from.max_error;
+ for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
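+// Return the upper edge of the first histogram bucket at which the cumulative
+// count reaches the requested quantile of all samples.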
+static double find_quantile(const error_stats & stats, double quantile) {
+ double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+
+ double accum = 0;
+ for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+ accum += stats.error_histogram[i];
+ if (accum >= sum*quantile) {
+ return (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+ }
+ }
+ return INFINITY;
+}
+
+static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+ double rmse = sqrt(stats.total_error / (double) stats.num_samples);
+ double median = find_quantile(stats, .5);
+ double pct95 = find_quantile(stats, .95);
+ printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+ if (print_histogram) {
+ printf("Error distribution:\n");
+ for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+ double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+ double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+ if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY;
+ printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+ }
+ }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+ static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+ return
+ tensor->nb[0] == ggml_type_size(tensor->type) &&
+ tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
+ tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
+ tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+static void test_roundtrip_on_chunk(
+ const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
+ float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
+) {
+ if (layer->type == GGML_TYPE_F16) {
+ for (int i = 0; i < chunk_size; i++) {
+ input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+ }
+ } else {
+ input_scratch = ggml_get_data_f32(layer) + offset;
+ }
+
+ if (use_reference) {
+ qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
+ } else {
+ qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
+ }
+ qfns.to_float(quantized_scratch, output_scratch, chunk_size);
+
+ update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
+// Run quantization function for a single layer and update error stats
+static void test_roundtrip_on_layer(
+ std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
+ const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
+ std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
+) {
+ assert(tensor_is_contiguous(layer));
+ error_stats layer_error {};
+ uint64_t nelements = ggml_nelements(layer);
+
+ float* input_scratch_ptr = nullptr;
+ if (layer->type == GGML_TYPE_F16) {
+ if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+ input_scratch_ptr = input_scratch.data();
+ }
+ if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+ if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+ if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+ int chunk_size = 32*512;
+ int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+ if (num_chunks < 2 || max_thread < 2) {
+ test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
+ output_scratch.data(), print_layer_stats ? layer_error : total_error);
+ } else {
+ auto & stats = print_layer_stats ? layer_error : total_error;
+ std::mutex mutex;
+ uint64_t counter = 0;
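+        // each worker repeatedly claims chunk_size elements under the mutex and
+        // folds its local error stats into the shared stats once input is exhausted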
+ auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
+ &quantized_scratch, &output_scratch, chunk_size] () {
+ error_stats local_stats {};
+ while (true) {
+ std::unique_lock<std::mutex> lock(mutex);
+ uint64_t offset = counter; counter += chunk_size;
+ if (offset >= nelements) {
+ combine_error_stats(stats, local_stats);
+ break;
+ }
+ lock.unlock();
+ uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+ test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
+ quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+ }
+ };
+ int nthread = std::min(num_chunks, max_thread);
+ std::vector<std::thread> workers(nthread-1);
+ for (auto& w : workers) w = std::thread(compute);
+ compute();
+ for (auto& w : workers) w.join();
+ }
+
+ if (print_layer_stats) {
+ print_error_stats(name, layer_error, false);
+ combine_error_stats(total_error, layer_error);
+ }
+}
+
+int main(int argc, char ** argv) {
+ ggml_time_init();
+
+ quantize_stats_params params;
+
+ // read command line
+
+ int max_thread = 0;
+ bool invalid_param = false;
+ std::string arg;
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+
+ if (arg == "-h" || arg == "--help") {
+ quantize_stats_print_usage(argc, argv);
+ exit(0);
+ } else if (arg == "-r" || arg == "--reference") {
+ params.reference = true;
+ } else if (arg == "-v") {
+ params.verbose = true;
+ } else if (arg == "-p" || arg == "--per-layer-stats") {
+ params.per_layer_stats = true;
+ } else if (arg == "--histogram") {
+ params.print_histogram = true;
+ } else if (arg == "-m" || arg == "--model") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.model = argv[i];
+ } else if (arg == "-l" || arg == "--include-layer") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.include_layers.emplace_back(argv[i]);
+ } else if (arg == "-L" || arg == "--exclude-layer") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.exclude_layers.emplace_back(argv[i]);
+ } else if (arg == "-t" || arg == "--type") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ int j;
+ for (j = 0; j < GGML_TYPE_COUNT; ++j) {
+ const auto * name = ggml_type_name((ggml_type) j);
+ if (name && strcmp(argv[i], name) == 0) break;
+ }
+ if (j < GGML_TYPE_COUNT) {
+ params.include_types.push_back((ggml_type) j);
+ } else {
+ fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+ invalid_param = true;
+ }
+ } else if (arg == "-n" || arg == "--num-threads") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ max_thread = atoi(argv[i]);
+ } else {
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+ quantize_stats_print_usage(argc, argv);
+ return 1;
+ }
+ }
+ if (invalid_param) {
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+ quantize_stats_print_usage(argc, argv);
+ return 1;
+ }
+
+ print_build_info();
+
+ // load the model
+ fprintf(stderr, "Loading model\n");
+
+ const int64_t t_main_start_us = ggml_time_us();
+ llama_model * model;
+ llama_context * ctx;
+
+ {
+ auto mparams = llama_model_default_params();
+ mparams.use_mlock = false;
+
+ model = llama_model_load_from_file(params.model.c_str(), mparams);
+
+ if (model == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return 1;
+ }
+
+ auto cparams = llama_context_default_params();
+ cparams.n_ctx = 256;
+
+ ctx = llama_init_from_model(model, cparams);
+
+ if (ctx == NULL) {
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ llama_model_free(model);
+ return 1;
+ }
+ }
+
+ const auto & tensors = llama_internal_get_tensor_map(model);
+
+ // check layer tensors
+ int included_layers = 0;
+ int64_t max_nelements = 0;
+ bool is_f16 = false;
+ for (const auto & kv_tensor : tensors) {
+ if (!layer_included(params, kv_tensor.first)) {
+ continue;
+ }
+ if (params.verbose) {
+ printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
+ }
+ if (kv_tensor.second->type == GGML_TYPE_F16) {
+ is_f16 = true;
+ } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+ fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+ "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+ llama_free(ctx);
+ llama_model_free(model);
+ return 1;
+ }
+ included_layers++;
+ max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+ }
+
+ if (is_f16) {
+ printf("note: source model is f16\n");
+ }
+ printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+ // allocate scratch space
+ std::vector<float> input_scratch;
+ std::vector<char> quantized_scratch;
+ std::vector<float> output_scratch;
+
+    // loop through quantization types
+ for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+ const ggml_type type = (ggml_type) i;
+ if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+ continue;
+ }
+ const auto * qfns = ggml_get_type_traits(type);
+ const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+ if (qfns_cpu->from_float && qfns->to_float) {
+ if (params.verbose) {
+ printf("testing %s ...\n", ggml_type_name(type));
+ }
+
+ ggml_quantize_init(type);
+
+ error_stats global_stats {};
+
+ for (const auto & kv_tensor : tensors) {
+ if (!layer_included(params, kv_tensor.first)) {
+ continue;
+ }
+ if (params.verbose) {
+ printf(" %s ...\n", kv_tensor.first.c_str());
+ }
+ std::string layer_name { ggml_type_name(type) };
+ layer_name += "::" + kv_tensor.first;
+ test_roundtrip_on_layer(
+ layer_name,
+ params.per_layer_stats,
+ *qfns, *qfns_cpu,
+ params.reference,
+ kv_tensor.second,
+ input_scratch,
+ quantized_scratch,
+ output_scratch,
+ global_stats,
+ max_thread
+ );
+ }
+
+ print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
+ }
+ }
+
+
+ llama_free(ctx);
+ llama_model_free(model);
+ // report timing
+ {
+ const int64_t t_main_end_us = ggml_time_us();
+
+ printf("\n");
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+ }
+
+ return 0;
+}
#include "llama.h"
#include "common.h"
-#include "unicode.h"
#include "console.h"
+#include "../src/unicode.h"
+
#include <cassert>
#include <codecvt>
#include <cstdio>
#include "llama.h"
#include "common.h"
-#include "unicode.h"
#include "console.h"
+#include "../src/unicode.h"
+
#include <cassert>
#include <codecvt>
#include <cstdio>