add_subdirectory(batched)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
- add_subdirectory(gbnf-validator)
+
+ if (NOT WIN32)
+ # disabled on Windows because it uses internal functions not exported with LLAMA_API
+ add_subdirectory(gbnf-validator)
+ endif()
+
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(gguf)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
- add_subdirectory(quantize-stats)
+ if (NOT WIN32)
+ # disabled on Windows because it uses internal functions not exported with LLAMA_API
+ add_subdirectory(quantize-stats)
+ endif()
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
}
void print_info() {
- printf("n_split: %ld\n", ctx_outs.size());
+ printf("n_split: %zu\n", ctx_outs.size());
int i_split = 0;
for (auto & ctx_out : ctx_outs) {
// re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
total_size += ggml_nbytes(t);
}
total_size = total_size / 1000 / 1000; // convert to megabytes
- printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+ printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}
}
for (const auto & inst : params_instances) {
params_idx++;
if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
}
// keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
// warmup run
if (t.n_prompt > 0) {
if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
}
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
}
test_gen(ctx, 1, t.n_threads);
}
if (t.n_prompt > 0) {
if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
i + 1, params.reps);
}
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
if (params.progress) {
- fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+ fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
i + 1, params.reps);
}
test_gen(ctx, t.n_gen, t.n_threads);
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
- LOG_INF("Number of chunks: %ld\n", chunks.size());
+ LOG_INF("Number of chunks: %zu\n", chunks.size());
llama_backend_init();
llama_numa_init(params.numa);
}
if (show_token_count) {
- printf("Total number of tokens: %ld\n", tokens.size());
+ printf("Total number of tokens: %zu\n", tokens.size());
}
// silence valgrind
llama_free(ctx);
endif()
endif()
+# remove the lib prefix on win32 mingw
+if (WIN32)
+ set(CMAKE_STATIC_LIBRARY_PREFIX "")
+ set(CMAKE_SHARED_LIBRARY_PREFIX "")
+ set(CMAKE_SHARED_MODULE_PREFIX "")
+endif()
+
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-
- if (BUILD_SHARED_LIBS)
- # TODO: should not use this
- set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
- endif()
endif()
# ggml
}
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
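+ // ggml_aligned_malloc handles the platform-specific aligned allocation (MSVC has no C11 aligned_alloc)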
+ void * data = ggml_aligned_malloc(size);
if (data == NULL) {
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
#endif
#include <windows.h>
-
-#if !defined(__clang__)
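+// limit this fallback to MSVC itself; MinGW GCC and clang on Windows can use the standard C11 atomics and alignment attributes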
+#if defined(_MSC_VER) && !defined(__clang__)
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
typedef volatile LONG atomic_int;
#include "windows.h"
// TODO: support > 64 CPUs
-bool ggml_thread_apply_affinity(bool * mask) {
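+// used only within this file, so keep it internal to the translation unit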
+static bool ggml_thread_apply_affinity(bool * mask) {
HANDLE h = GetCurrentThread();
uint64_t bitmask = 0ULL;
//
GGML_ATTRIBUTE_FORMAT(2, 3)
-void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
-void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
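+// these are used from outside the ggml library, so they need an explicit GGML_API export now that CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS is no longer set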
+GGML_API void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
+GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
// Memory allocation
-void * ggml_aligned_malloc(size_t size);
-void ggml_aligned_free(void * ptr, size_t size);
+GGML_API void * ggml_aligned_malloc(size_t size);
+GGML_API void ggml_aligned_free(void * ptr, size_t size);
// FP16 to FP32 conversion
#pragma once
+#include "ggml.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-void ggml_critical_section_start(void);
-void ggml_critical_section_end(void);
+GGML_API void ggml_critical_section_start(void);
+GGML_API void ggml_critical_section_end(void);
#ifdef __cplusplus
}
-# TODO: should not use this
-if (WIN32)
- if (BUILD_SHARED_LIBS)
- set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
- endif()
-endif()
-
llama_add_compile_flags()
#
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
- ret = format("Win32 error code: %s", error_code);
+ ret = format("Win32 error code: %lx", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
// may fail on pre-Windows 8 systems
- pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
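+ // cast through void * so converting the FARPROC from GetProcAddress does not trip function-pointer cast warnings (e.g. GCC's -Wcast-function-type)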
+ pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");
if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
}
} else if ((size_t) i >= ctx->output_ids.size()) {
- throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+ throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
} else {
j = ctx->output_ids[i];
}
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-# build test-tokenizer-1-bpe target once and add many tests
-add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
-target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
-install(TARGETS test-tokenizer-1-bpe RUNTIME)
-
-# TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-
-# build test-tokenizer-1-spm target once and add many tests
-add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
-target_link_libraries(test-tokenizer-1-spm PRIVATE common)
-install(TARGETS test-tokenizer-1-spm RUNTIME)
-
-llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-# llama_target_and_test(test-double-float.cpp) # SLOW
+
+if (NOT WIN32)
+ # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
+ llama_target_and_test(test-sampling.cpp)
+ llama_target_and_test(test-grammar-parser.cpp)
+ llama_target_and_test(test-grammar-integration.cpp)
+ llama_target_and_test(test-llama-grammar.cpp)
+ # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+ if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+ llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+ target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+ endif()
+
+ # build test-tokenizer-1-bpe target once and add many tests
+ add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+ install(TARGETS test-tokenizer-1-bpe RUNTIME)
+
+ # TODO: disabled due to slowness
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+ #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+
+ # build test-tokenizer-1-spm target once and add many tests
+ add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+ target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+ install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+ #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+ # llama_target_and_test(test-double-float.cpp) # SLOW
+endif()
+
llama_target_and_test(test-log.cpp)
llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)
-llama_target_and_test(test-grammar-parser.cpp)
-llama_target_and_test(test-grammar-integration.cpp)
-llama_target_and_test(test-llama-grammar.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)
llama_target_and_test(test-rope.cpp)
endif()
-# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
-if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
- llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
- target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
-endif()
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)