#
# keep standard at C11 and C++11
-MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
# some memory allocation functions are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -D_GNU_SOURCE
+ MK_LDFLAGS += -ldl
endif
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
cSettings.append(
contentsOf: [
.define("GGML_USE_ACCELERATE"),
- .define("GGML_USE_METAL")
+ .define("GGML_USE_METAL"),
+ .define("GGML_USE_CPU")
]
)
#endif
#endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+
+ // load dynamic backends
+ ggml_backend_load_all();
}
std::string common_params_get_system_info(const common_params & params) {
if (EMSCRIPTEN)
else()
- add_subdirectory(cvector-generator)
add_subdirectory(batched-bench)
add_subdirectory(batched)
- add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
- add_subdirectory(export-lora)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
- add_subdirectory(llava)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
- add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
- if (GGML_RPC)
- add_subdirectory(rpc)
- endif()
if (LLAMA_BUILD_SERVER)
- add_subdirectory(server)
- endif()
- if (GGML_SYCL)
- add_subdirectory(sycl)
+ add_subdirectory(server)
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(speculative-simple)
add_subdirectory(tokenize)
+ if (NOT GGML_BACKEND_DL)
+ # these examples use the backends directly and cannot be built with dynamic loading
+ add_subdirectory(convert-llama2c-to-ggml)
+ add_subdirectory(cvector-generator)
+ add_subdirectory(export-lora)
+ add_subdirectory(quantize-stats)
+ add_subdirectory(llava)
+ if (GGML_RPC)
+ add_subdirectory(rpc)
+ endif()
+ if (GGML_SYCL)
+ add_subdirectory(sycl)
+ endif()
+ endif()
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET}
+ COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
cmd_params params = parse_cmd_params(argc, argv);
+ // initialize backends
+ ggml_backend_load_all();
+ auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ if (!cpu_dev) {
+ fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+ return 1;
+ }
+ auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+ auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+ auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
// initialize llama.cpp
if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL);
tpp.poll = t.poll;
tpp.prio = params.prio;
- struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
llama_free(ctx);
- ggml_threadpool_free(threadpool);
+ ggml_threadpool_free_fn(threadpool);
}
llama_free_model(lmodel);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+ auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+ auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
- threadpool_batch = ggml_threadpool_new(&tpp_batch);
+ threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
if (!threadpool_batch) {
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
return 1;
tpp.paused = true;
}
- struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
llama_backend_free();
- ggml_threadpool_free(threadpool);
- ggml_threadpool_free(threadpool_batch);
+ ggml_threadpool_free_fn(threadpool);
+ ggml_threadpool_free_fn(threadpool_batch);
return 0;
}
}
}, nullptr);
+ // load dynamic backends
+ ggml_backend_load_all();
+
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;
}
}
+ // load dynamic backends
+ ggml_backend_load_all();
+
// initialize the model
llama_model_params model_params = llama_model_default_params();
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
#
# option list
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+ // Set the abort callback for the backend
+ typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+ // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+ struct ggml_backend_feature {
+ const char * name;
+ const char * value;
+ };
+ typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Backend registry
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
+ // Load a backend from a dynamic library and register it
+ GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+ // Unload a backend if loaded dynamically and unregister it
+ GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
+ // Load all known backends from dynamic libraries
+ GGML_API void ggml_backend_load_all(void);
+
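
A minimal usage sketch of the loading API above (not part of the patch; the library file name is only an example, and it assumes the backends were built with GGML_BACKEND_DL):

#include "ggml-backend.h"
#include <cstdio>

int main() {
    // discover and register all known backend libraries (searched by name and next to the executable)
    ggml_backend_load_all();

    // or load one backend from an explicit path
    ggml_backend_reg_t extra = ggml_backend_load("./libggml-blas.so");

    // each registered backend contributes its devices to the global registry
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s - %s\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
    }

    // a dynamically loaded backend can be unregistered and its library unloaded again
    if (extra) {
        ggml_backend_unload(extra);
    }
    return 0;
}
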
//
// Backend scheduler
//
extern "C" {
#endif
- // Scheduling priorities
- enum ggml_sched_priority {
- GGML_SCHED_PRIO_NORMAL,
- GGML_SCHED_PRIO_MEDIUM,
- GGML_SCHED_PRIO_HIGH,
- GGML_SCHED_PRIO_REALTIME
- };
-
- // Threadpool params
- // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
- struct ggml_threadpool_params {
- bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
- int n_threads; // number of threads
- enum ggml_sched_priority prio; // thread priority
- uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
- bool strict_cpu; // strict cpu placement
- bool paused; // start in paused state
- };
-
- struct ggml_threadpool; // forward declaration, see ggml.c
-
- typedef struct ggml_threadpool * ggml_threadpool_t;
-
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
- GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
- GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
- GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
- GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
- GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
- GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
- GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
- GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
+ GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
+ GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+ GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+ GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+ GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
+ GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
- GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
+ // ggml threadpool
+ // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+ // the goal should be to create an API that other backends can use, and then move everything to the ggml base
+
+ // scheduling priorities
+ enum ggml_sched_priority {
+ GGML_SCHED_PRIO_NORMAL,
+ GGML_SCHED_PRIO_MEDIUM,
+ GGML_SCHED_PRIO_HIGH,
+ GGML_SCHED_PRIO_REALTIME
+ };
+
+ // threadpool params
+ // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+ struct ggml_threadpool_params {
+ bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+ int n_threads; // number of threads
+ enum ggml_sched_priority prio; // thread priority
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
+ bool strict_cpu; // strict cpu placement
+ bool paused; // start in paused state
+ };
+
+ struct ggml_threadpool; // forward declaration, see ggml.c
+
+ typedef struct ggml_threadpool * ggml_threadpool_t;
+
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+
#ifdef __cplusplus
}
#endif
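
A minimal sketch of the resulting split, assuming the CPU backend is loaded (the helper name make_threadpool is illustrative): the params helpers declared above stay in the base API, while ggml_threadpool_new/ggml_threadpool_free are obtained from the CPU backend's registry via get_proc_address, as the llama-bench and main changes earlier in this patch do:

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cpu.h" // only for the ggml_threadpool_new prototype used by decltype below

// create a threadpool without linking against the CPU backend directly
static struct ggml_threadpool * make_threadpool(int n_threads) {
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads); // base API

    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        return nullptr; // CPU backend not loaded
    }
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto * new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
    return new_fn ? new_fn(&tpp) : nullptr;
}
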
# ggml
+if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
+ message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
+endif()
+
add_library(ggml-base
../include/ggml.h
../include/ggml-alloc.h
target_link_libraries(ggml PUBLIC ggml-base)
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+ target_link_libraries(ggml PRIVATE dl)
+endif()
+
+function(ggml_add_backend_library backend)
+ if (GGML_BACKEND_DL)
+ add_library(${backend} MODULE ${ARGN})
+ # write the shared library to the output directory
+ set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+ target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
+ else()
+ add_library(${backend} ${ARGN})
+ target_link_libraries(ggml PUBLIC ${backend})
+ install(TARGETS ${backend} LIBRARY)
+ endif()
+
+ target_link_libraries(${backend} PRIVATE ggml-base)
+ target_include_directories(${backend} PRIVATE ..)
+
+ if (${BUILD_SHARED_LIBS})
+ target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
+ target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED)
+ endif()
+endfunction()
+
function(ggml_add_backend backend)
string(TOUPPER "GGML_${backend}" backend_id)
if (${backend_id})
# however, currently it is necessary for AMX, since it is enabled by default on llama.cpp
if (${backend_id})
message(STATUS "Including ${backend} backend")
- if (${BUILD_SHARED_LIBS})
- target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
- target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
+ if (NOT GGML_BACKEND_DL)
+ string(TOUPPER "GGML_USE_${backend}" backend_use)
+ target_compile_definitions(ggml PUBLIC ${backend_use})
endif()
- install(TARGETS ${backend_target} LIBRARY)
- target_link_libraries(ggml PUBLIC ${backend_target})
- string(TOUPPER "GGML_USE_${backend}" backend_use)
- target_compile_definitions(ggml PUBLIC ${backend_use})
endif()
endif()
endfunction()
ggml_add_backend(HIP)
ggml_add_backend(Kompute)
ggml_add_backend(METAL)
+ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
-ggml_add_backend(MUSA)
foreach (target ggml-base ggml)
target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
file(GLOB GGML_SOURCES_AMX "*.cpp")
- add_library(ggml-amx
- ${GGML_HEADERS_AMX}
- ${GGML_SOURCES_AMX})
-
- target_link_libraries(ggml-amx PRIVATE ggml-base)
- target_include_directories(ggml-amx PRIVATE . ..)
+ ggml_add_backend_library(ggml-amx
+ ${GGML_HEADERS_AMX}
+ ${GGML_SOURCES_AMX}
+ )
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
# TODO: integrate AMX backend into the CPU backend
ggml_backend_reg_t ggml_backend_amx_reg(void) {
static struct ggml_backend_reg ggml_backend_amx_reg = {
- /* .iface = */ ggml_backend_amx_reg_i,
- /* .context = */ NULL,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_amx_reg_i,
+ /* .context = */ NULL,
};
return &ggml_backend_amx_reg;
}
#endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
extern "C" {
#endif
+ #define GGML_BACKEND_API_VERSION 1
+
//
// Backend buffer type
//
enum ggml_backend_buffer_usage usage;
};
- ggml_backend_buffer_t ggml_backend_buffer_init(
+ GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
void * context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+ GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// multi-buffer
// buffer that contains a collection of buffers
- ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
- bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
- void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+ GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+ GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+ GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend (stream)
};
struct ggml_backend_reg {
- // int api_version; // TODO: for dynamic loading
+ int api_version; // initialize to GGML_BACKEND_API_VERSION
struct ggml_backend_reg_i iface;
void * context;
};
-
// Internal backend registry API
- void ggml_backend_register(ggml_backend_reg_t reg);
- void ggml_backend_device_register(ggml_backend_dev_t device);
- // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
- // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
+ GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+ GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
+ // Dynamic loading support: a backend built as a dynamic library exports a ggml_backend_init() entry point with this signature
+ typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+
+ #ifdef GGML_BACKEND_DL
+ #ifdef __cplusplus
+ # define GGML_BACKEND_DL_IMPL(reg_fn) \
+ extern "C" { \
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+ } \
+ ggml_backend_reg_t ggml_backend_init(void) { \
+ return reg_fn(); \
+ }
+ #else
+ # define GGML_BACKEND_DL_IMPL(reg_fn) \
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+ ggml_backend_reg_t ggml_backend_init(void) { \
+ return reg_fn(); \
+ }
+ #endif
+ #else
+ # define GGML_BACKEND_DL_IMPL(reg_fn)
+ #endif
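
For illustration, a minimal registration for a hypothetical "foo" backend under the new convention (all foo_* names are made up; it mirrors the AMX/BLAS/CPU registrations elsewhere in this change): the reg struct gains api_version, and the translation unit ends with GGML_BACKEND_DL_IMPL so that a dynamic-library build exports the ggml_backend_init() entry point resolved by the loader.

#include "ggml-backend-impl.h"

// hypothetical backend "foo" with no devices (sketch only)
static const char * ggml_backend_foo_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    return "foo";
}

static size_t ggml_backend_foo_reg_device_count(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    return 0;
}

static ggml_backend_dev_t ggml_backend_foo_reg_device_get(ggml_backend_reg_t reg, size_t index) {
    GGML_UNUSED(reg);
    GGML_UNUSED(index);
    return NULL;
}

static struct ggml_backend_reg_i ggml_backend_foo_reg_i = {
    /* .get_name         = */ ggml_backend_foo_reg_get_name,
    /* .device_count     = */ ggml_backend_foo_reg_device_count,
    /* .device_get       = */ ggml_backend_foo_reg_device_get,
    /* .get_proc_address = */ NULL,
};

ggml_backend_reg_t ggml_backend_foo_reg(void) {
    static struct ggml_backend_reg reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_foo_reg_i,
        /* .context     = */ NULL,
    };
    return &reg;
}

// expands to an exported ggml_backend_init() when GGML_BACKEND_DL is defined, otherwise to nothing
GGML_BACKEND_DL_IMPL(ggml_backend_foo_reg)
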
#ifdef __cplusplus
}
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
-#include "ggml-cpu.h"
#include "ggml-impl.h"
+#include <algorithm>
#include <cstring>
+#include <string>
#include <vector>
+#ifdef _WIN32
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+# include <windows.h>
+#elif defined(__APPLE__)
+# include <mach-o/dyld.h>
+# include <dlfcn.h>
+#else
+# include <dlfcn.h>
+# include <unistd.h>
+#endif
+
// Backend registry
+#ifdef GGML_USE_CPU
+#include "ggml-cpu.h"
+#endif
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#include "ggml-kompute.h"
#endif
+struct ggml_backend_reg_entry {
+ ggml_backend_reg_t reg;
+ void * handle;
+};
+
struct ggml_backend_registry {
- std::vector<ggml_backend_reg_t> backends;
+ std::vector<ggml_backend_reg_entry> backends;
std::vector<ggml_backend_dev_t> devices;
ggml_backend_registry() {
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
-
+#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
+#endif
}
- void register_backend(ggml_backend_reg_t reg) {
+ ~ggml_backend_registry() {
+ while (!backends.empty()) {
+ // use silent since the log system may have been destroyed at this point
+ unload_backend(backends.back().reg, true);
+ }
+ }
+
+ void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) {
if (!reg) {
return;
}
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
- backends.push_back(reg);
+ backends.push_back({ reg, handle });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
}
#endif
devices.push_back(device);
}
+
+ ggml_backend_reg_t load_backend(const char * path, bool silent) {
+#ifdef _WIN32
+ // suppress error dialogs for missing DLLs
+ DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
+ SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
+
+ HMODULE handle = LoadLibraryA(path);
+
+ if (!handle) {
+ if (!silent) {
+ GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
+ }
+ SetErrorMode(old_mode);
+ return nullptr;
+ }
+
+ ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
+
+ SetErrorMode(old_mode);
+
+ if (!backend_init) {
+ if (!silent) {
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
+ }
+ FreeLibrary(handle);
+ return nullptr;
+ }
+#else
+ void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+
+ if (!handle) {
+ if (!silent) {
+ GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
+ }
+ return nullptr;
+ }
+
+ auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
+
+ if (!backend_init) {
+ if (!silent) {
+ GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
+ }
+ dlclose(handle);
+ return nullptr;
+ }
+#endif
+ ggml_backend_reg_t reg = backend_init();
+
+ if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
+ if (!silent) {
+ if (!reg) {
+ GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+ } else {
+ GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
+ __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+ }
+ }
+#ifdef _WIN32
+ FreeLibrary(handle);
+#else
+ dlclose(handle);
+#endif
+ return nullptr;
+ }
+
+ GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+ register_backend(reg, handle);
+ return reg;
+ }
+
+ void unload_backend(ggml_backend_reg_t reg, bool silent) {
+ auto it = std::find_if(backends.begin(), backends.end(),
+ [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; });
+
+ if (it == backends.end()) {
+ if (!silent) {
+ GGML_LOG_ERROR("%s: backend not found\n", __func__);
+ }
+ return;
+ }
+
+ if (!silent) {
+ GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
+ }
+
+ // remove devices
+ devices.erase(
+ std::remove_if(devices.begin(), devices.end(),
+ [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
+ devices.end());
+
+ // unload library
+ if (it->handle) {
+#ifdef _WIN32
+ FreeLibrary((HMODULE) it->handle);
+#else
+ dlclose(it->handle);
+#endif
+ }
+
+ // remove backend
+ backends.erase(it);
+ }
};
static ggml_backend_registry & get_reg() {
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
GGML_ASSERT(index < ggml_backend_reg_count());
- return get_reg().backends[index];
+ return get_reg().backends[index].reg;
}
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
return reg;
}
}
- return NULL;
+ return nullptr;
}
// Device enumeration
return dev;
}
}
- return NULL;
+ return nullptr;
}
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
return dev;
}
}
- return NULL;
+ return nullptr;
}
// Convenience functions
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
if (!dev) {
- return NULL;
+ return nullptr;
}
return ggml_backend_dev_init(dev, params);
}
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
if (!dev) {
- return NULL;
+ return nullptr;
}
return ggml_backend_dev_init(dev, params);
}
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
if (!dev) {
- return NULL;
+ return nullptr;
}
- return ggml_backend_dev_init(dev, NULL);
+ return ggml_backend_dev_init(dev, nullptr);
+}
+
+// Dynamic loading
+ggml_backend_reg_t ggml_backend_load(const char * path) {
+ return get_reg().load_backend(path, false);
+}
+
+void ggml_backend_unload(ggml_backend_reg_t reg) {
+ get_reg().unload_backend(reg, true);
+}
+
+void ggml_backend_load_all() {
+ std::vector<std::string> search_prefix;
+
+ // add the executable directory to the search path
+ // FIXME: this is convenient for development, but it should probably be disabled in production
+
+#if defined(__APPLE__)
+ // get executable path
+ std::vector<char> path;
+ uint32_t size;
+ while (true) {
+ size = path.size();
+ if (_NSGetExecutablePath(path.data(), &size) == 0) {
+ break;
+ }
+ path.resize(size);
+ }
+ std::string base_path(path.data(), size);
+ // remove executable name
+ auto last_slash = base_path.find_last_of('/');
+ if (last_slash != std::string::npos) {
+ base_path = base_path.substr(0, last_slash);
+ }
+ search_prefix.push_back(base_path + "/");
+#elif defined(__linux__)
+ std::string base_path = ".";
+ std::vector<char> path(1024);
+ while (true) {
+ // get executable path
+ ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+ if (len == -1) {
+ break;
+ }
+ if (len < (ssize_t) path.size()) {
+ base_path = std::string(path.data(), len);
+ // remove executable name
+ auto last_slash = base_path.find_last_of('/');
+ if (last_slash != std::string::npos) {
+ base_path = base_path.substr(0, last_slash);
+ }
+ break;
+ }
+ path.resize(path.size() * 2);
+ }
+
+ search_prefix.push_back(base_path + "/");
+#endif
+
+ auto & reg = get_reg();
+
+ auto try_load = [&](const std::string & name) {
+ std::string os_name;
+#ifdef _WIN32
+ os_name = "ggml-" + name + ".dll";
+#else
+ os_name = "libggml-" + name + ".so";
+#endif
+ if (reg.load_backend(os_name.c_str(), true)) {
+ return;
+ }
+ for (const auto & prefix : search_prefix) {
+ if (reg.load_backend((prefix + os_name).c_str(), true)) {
+ return;
+ }
+ }
+ };
+
+ try_load("amx");
+ try_load("blas");
+ try_load("cann");
+ try_load("cuda");
+ try_load("hip");
+ try_load("kompute");
+ try_load("metal");
+ try_load("rpc");
+ try_load("sycl");
+ try_load("vulkan");
+ try_load("musa");
+ try_load("cpu");
}
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
- add_library(ggml-blas
- ggml-blas.cpp
- )
-
- target_link_libraries(ggml-blas PRIVATE ggml-base)
- target_include_directories(ggml-blas PRIVATE . ..)
+ ggml_add_backend_library(ggml-blas
+ ggml-blas.cpp
+ )
if (${GGML_BLAS_VENDOR} MATCHES "Apple")
add_compile_definitions(ACCELERATE_NEW_LAPACK)
ggml_backend_reg_t ggml_backend_blas_reg(void) {
static struct ggml_backend_reg ggml_backend_blas_reg = {
- /* .iface = */ ggml_backend_blas_reg_i,
- /* .context = */ NULL,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_blas_reg_i,
+ /* .context = */ NULL,
};
return &ggml_backend_blas_reg;
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
file(GLOB GGML_SOURCES_CANN "*.cpp")
- add_library(ggml-cann ${GGML_SOURCES_CANN})
- target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
- target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
+ ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
+ target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
+ target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device {
- /* .interface = */ ggml_backend_cann_device_interface,
- /* .reg = */ ®,
- /* .context = */ dev_ctx
+ /* .iface = */ ggml_backend_cann_device_interface,
+ /* .reg = */ ®,
+ /* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg {
- /* .interface = */ ggml_backend_cann_reg_interface,
- /* .context = */ ctx
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_cann_reg_interface,
+ /* .context = */ ctx
};
}
ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
-add_library(ggml-cpu
- ggml-cpu.c
- ggml-cpu.cpp
- ggml-cpu-aarch64.c
- ggml-cpu-aarch64.h
- ggml-cpu-quants.c
- ggml-cpu-quants.h
- )
+ggml_add_backend_library(ggml-cpu
+ ggml-cpu.c
+ ggml-cpu.cpp
+ ggml-cpu-aarch64.c
+ ggml-cpu-aarch64.h
+ ggml-cpu-quants.c
+ ggml-cpu-quants.h
+ )
-target_link_libraries(ggml-cpu PRIVATE ggml-base)
-target_include_directories(ggml-cpu PRIVATE . ..)
+target_include_directories(ggml-cpu PRIVATE .)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
#endif // GGML_USE_OPENMP
-void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
- p->n_threads = n_threads;
- p->prio = 0; // default priority (usually means normal or inherited)
- p->poll = 50; // hybrid-polling enabled
- p->strict_cpu = false; // no strict placement (all threads share same cpumask)
- p->paused = false; // threads are ready to go
- memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
-}
-
-struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
- struct ggml_threadpool_params p;
- ggml_threadpool_params_init(&p, n_threads);
- return p;
-}
-
-bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
- if (p0->n_threads != p1->n_threads ) return false;
- if (p0->prio != p1->prio ) return false;
- if (p0->poll != p1->poll ) return false;
- if (p0->strict_cpu != p1->strict_cpu ) return false;
- return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
-}
-
static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph,
return &ggml_backend_cpu_device;
}
-struct ggml_backend_feature {
- const char * name;
- const char * value;
-};
-
-// Not used yet
// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
-// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
+ ggml_cpu_init();
+
std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" });
if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" });
}
+ if (ggml_cpu_has_avx_vnni()) {
+ features.push_back({ "AVX_VNNI", "1" });
+ }
if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" });
}
if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" });
}
- if (ggml_cpu_has_avx_vnni()) {
- features.push_back({ "AVX_VNNI", "1" });
- }
if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" });
}
if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" });
}
+ // TODO: rename this
+ #ifdef GGML_USE_CPU_AARCH64
+ features.push_back({ "AARCH64_REPACK", "1" });
+ #endif
features.push_back({ nullptr, nullptr });
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}
+ if (strcmp(name, "ggml_backend_get_features") == 0) {
+ return (void *)ggml_backend_cpu_get_features;
+ }
+ if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
+ return (void *)ggml_backend_cpu_set_abort_callback;
+ }
+ if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
+ return (void *)ggml_numa_init;
+ }
+ if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
+ return (void *)ggml_is_numa;
+ }
+
+ // threadpool - TODO: move to ggml-base
+ if (strcmp(name, "ggml_threadpool_new") == 0) {
+ return (void *)ggml_threadpool_new;
+ }
+ if (strcmp(name, "ggml_threadpool_free") == 0) {
+ return (void *)ggml_threadpool_free;
+ }
+ if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
+ return (void *)ggml_backend_cpu_set_threadpool;
+ }
return NULL;
ggml_cpu_init();
static struct ggml_backend_reg ggml_backend_cpu_reg = {
- /* .iface = */ ggml_backend_cpu_reg_i,
- /* .context = */ NULL,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_cpu_reg_i,
+ /* .context = */ NULL,
};
return &ggml_backend_cpu_reg;
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
- add_library(ggml-cuda
- ${GGML_HEADERS_CUDA}
- ${GGML_SOURCES_CUDA}
- )
-
- target_link_libraries(ggml-cuda PRIVATE ggml-base)
- target_include_directories(ggml-cuda PRIVATE . ..)
+ ggml_add_backend_library(ggml-cuda
+ ${GGML_HEADERS_CUDA}
+ ${GGML_SOURCES_CUDA}
+ )
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
return ctx->devices[index];
}
+static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
+ static std::vector<ggml_backend_feature> features = []() {
+ std::vector<ggml_backend_feature> features;
+ #define _STRINGIFY(...) #__VA_ARGS__
+ #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
+
+ #ifdef __CUDA_ARCH_LIST__
+ features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
+ #endif
+
+ #ifdef GGML_CUDA_FORCE_MMQ
+ features.push_back({ "FORCE_MMQ", "1" });
+ #endif
+
+ #ifdef GGML_CUDA_FORCE_CUBLAS
+ features.push_back({ "FORCE_CUBLAS", "1" });
+ #endif
+
+ #ifdef GGML_CUDA_NO_VMM
+ features.push_back({ "NO_VMM", "1" });
+ #endif
+
+ #ifdef GGML_CUDA_NO_PEER_COPY
+ features.push_back({ "NO_PEER_COPY", "1" });
+ #endif
+
+ #ifdef GGML_CUDA_F16
+ features.push_back({ "F16", "1" });
+ #endif
+
+ #ifdef GGML_CUDA_USE_GRAPHS
+ features.push_back({ "USE_GRAPHS", "1" });
+ #endif
+
+ #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+ features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
+ #endif
+
+ #ifdef GGML_CUDA_FA_ALL_QUANTS
+ features.push_back({ "FA_ALL_QUANTS", "1" });
+ #endif
+
+ #undef _STRINGIFY
+ #undef STRINGIFY
+
+ features.push_back({ nullptr, nullptr });
+
+ return features;
+ }();
+
+ return features.data();
+
+ GGML_UNUSED(reg);
+}
+
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
return (void *)ggml_backend_cuda_unregister_host_buffer;
}
+ if (strcmp(name, "ggml_backend_get_features") == 0) {
+ return (void *)ggml_backend_cuda_get_features;
+ }
return nullptr;
}
dev_ctx->description = prop.name;
ggml_backend_dev_t dev = new ggml_backend_device {
- /* .interface = */ ggml_backend_cuda_device_interface,
- /* .reg = */ ®,
- /* .context = */ dev_ctx
+ /* .iface = */ ggml_backend_cuda_device_interface,
+ /* .reg = */ ®,
+ /* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg {
- /* .interface = */ ggml_backend_cuda_reg_interface,
- /* .context = */ ctx
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_cuda_reg_interface,
+ /* .context = */ ctx
};
}
return cuda_backend;
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif()
-add_library(ggml-hip
- ${GGML_HEADERS_ROCM}
- ${GGML_SOURCES_ROCM})
-
-target_link_libraries(ggml-hip PRIVATE ggml-base)
-target_include_directories(ggml-hip PRIVATE . ..)
+ggml_add_backend_library(ggml-hip
+ ${GGML_HEADERS_ROCM}
+ ${GGML_SOURCES_ROCM}
+ )
# TODO: do not use CUDA definitions for HIP
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
message(FATAL_ERROR "glslc not found")
endif()
-add_library(ggml-kompute
- ggml-kompute.cpp
- ../../include/ggml-kompute.h
- )
+ggml_add_backend_library(ggml-kompute
+ ggml-kompute.cpp
+ ../../include/ggml-kompute.h
+ )
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
-target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
ggml_backend_reg_t ggml_backend_kompute_reg() {
static ggml_backend_reg reg = {
- /* .iface = */ ggml_backend_kompute_reg_i,
- /* .context = */ nullptr,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_kompute_reg_i,
+ /* .context = */ nullptr,
};
return ®
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
message(STATUS "Metal framework found")
-add_library(ggml-metal
- ggml-metal.m
- )
+ggml_add_backend_library(ggml-metal
+ ggml-metal.m
+ )
target_link_libraries(ggml-metal PRIVATE
- ggml-base
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
-target_include_directories(ggml-metal PRIVATE . ..)
-
if (GGML_METAL_NDEBUG)
add_compile_definitions(GGML_METAL_NDEBUG)
endif()
GGML_UNUSED(index);
}
+static struct ggml_backend_feature g_ggml_backend_metal_features[] = {
+#if defined(GGML_METAL_EMBED_LIBRARY)
+ { "EMBED_LIBRARY", "1" },
+#endif
+#if defined(GGML_METAL_USE_BF16)
+ { "BF16", "1" },
+#endif
+ { nil, nil },
+};
+
+static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) {
+ return g_ggml_backend_metal_features;
+
+ GGML_UNUSED(reg);
+}
+
+static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+ if (strcmp(name, "ggml_backend_get_features") == 0) {
+ return (void *)ggml_backend_metal_get_features;
+ }
+
+ return NULL;
+
+ GGML_UNUSED(reg);
+}
static struct ggml_backend_reg_i ggml_backend_metal_reg_i = {
/* .get_name = */ ggml_backend_metal_reg_get_name,
/* .device_count = */ ggml_backend_metal_reg_device_count,
/* .device_get = */ ggml_backend_metal_reg_device_get,
- /* .get_proc_address = */ NULL,
+ /* .get_proc_address = */ ggml_backend_metal_get_proc_address,
};
ggml_backend_reg_t ggml_backend_metal_reg(void) {
// TODO: make this thread-safe somehow?
{
g_ggml_backend_metal_reg = (struct ggml_backend_reg) {
- /* .iface = */ ggml_backend_metal_reg_i,
- /* .context = */ NULL,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_metal_reg_i,
+ /* .context = */ NULL,
};
g_ggml_backend_metal_device = (struct ggml_backend_device) {
return &g_ggml_backend_metal_reg;
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
endforeach()
- add_library(ggml-musa
- ${GGML_HEADERS_MUSA}
- ${GGML_SOURCES_MUSA})
-
- target_link_libraries(ggml-musa PRIVATE ggml-base)
- target_include_directories(ggml-musa PRIVATE . ..)
+ ggml_add_backend_library(ggml-musa
+ ${GGML_HEADERS_MUSA}
+ ${GGML_SOURCES_MUSA}
+ )
# TODO: do not use CUDA definitions for MUSA
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
message(STATUS "Using RPC backend")
-add_library(ggml-rpc
- ggml-rpc.cpp)
-
-target_link_libraries(ggml-rpc PRIVATE ggml-base)
-target_include_directories(ggml-rpc PRIVATE . ..)
+ggml_add_backend_library(ggml-rpc
+ ggml-rpc.cpp
+ )
if (WIN32)
target_link_libraries(ggml-rpc PRIVATE ws2_32)
ggml_backend_reg_t ggml_backend_rpc_reg(void) {
static struct ggml_backend_reg ggml_backend_rpc_reg = {
- /* .iface = */ ggml_backend_rpc_reg_i,
- /* .context = */ NULL,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_rpc_reg_i,
+ /* .context = */ NULL,
};
return &ggml_backend_rpc_reg;
return dev;
}
+
+GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
message(STATUS "SYCL found")
#todo: AOT
-add_library(ggml-sycl
- ggml-sycl.cpp
- ../../include/ggml-sycl.h)
-
-target_link_libraries(ggml-sycl PRIVATE ggml-base)
-target_include_directories(ggml-sycl PRIVATE . ..)
+ggml_add_backend_library(ggml-sycl
+ ggml-sycl.cpp
+ ../../include/ggml-sycl.h
+ )
if (GGML_SYCL_F16)
if (GGML_SYCL_TARGET STREQUAL "AMD")
dev_ctx->description = prop.get_name();
ggml_backend_dev_t dev = new ggml_backend_device {
- /* .interface = */ ggml_backend_sycl_device_interface,
- /* .reg = */ ®,
- /* .context = */ dev_ctx
+ /* .iface = */ ggml_backend_sycl_device_interface,
+ /* .reg = */ ®,
+ /* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
reg = ggml_backend_reg {
- /* .interface = */ ggml_backend_sycl_reg_interface,
- /* .context = */ ctx
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_sycl_reg_interface,
+ /* .context = */ ctx
};
}
return sycl_backend;
}
+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
- add_library(ggml-vulkan
- ggml-vulkan.cpp
- ../../include/ggml-vulkan.h
- )
+ ggml_add_backend_library(ggml-vulkan
+ ggml-vulkan.cpp
+ ../../include/ggml-vulkan.h
+ )
- target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan)
- target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
+ target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
+ target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
ggml_backend_reg_t ggml_backend_vk_reg() {
static ggml_backend_reg reg = {
- /* .iface = */ ggml_backend_vk_reg_i,
- /* .context = */ nullptr,
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_vk_reg_i,
+ /* .context = */ nullptr,
};
return ®
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
}
#endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
g_logger_state.log_callback_user_data = user_data;
}
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+ p->n_threads = n_threads;
+ p->prio = 0; // default priority (usually means normal or inherited)
+ p->poll = 50; // hybrid-polling enabled
+ p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+ p->paused = false; // threads are ready to go
+ memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+ struct ggml_threadpool_params p;
+ ggml_threadpool_params_init(&p, n_threads);
+ return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+ if (p0->n_threads != p1->n_threads ) return false;
+ if (p0->prio != p1->prio ) return false;
+ if (p0->poll != p1->poll ) return false;
+ if (p0->strict_cpu != p1->strict_cpu ) return false;
+ return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
if (EMSCRIPTEN)
else()
- add_subdirectory(vdot)
+ if (NOT GGML_BACKEND_DL)
+ add_subdirectory(vdot)
+ endif()
endif()
mappings.reserve(files.size());
mmaps_used.reserve(files.size());
for (const auto & file : files) {
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+ auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
mmaps_used.emplace_back(mapping->size, 0);
if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) {
// FIXME: workaround for CPU backend buft having a NULL device
- dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
int n_threads,
ggml_threadpool * threadpool) {
if (lctx.backend_cpu != nullptr) {
- ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
- ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+ auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+ set_threadpool_fn(lctx.backend_cpu, threadpool);
}
// set the number of threads for all the backends
void llama_numa_init(enum ggml_numa_strategy numa) {
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
- ggml_numa_init(numa);
+ auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ GGML_ASSERT(dev && "CPU backend is not loaded");
+ auto * reg = ggml_backend_dev_backend_reg(dev);
+ auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+ numa_init_fn(numa);
}
}
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
- ctx->abort_callback = params.abort_callback;
- ctx->abort_callback_data = params.abort_callback_data;
-
ctx->logits_all = params.logits_all;
// build worst-case graph for encoder if a model contains encoder
}
// add CPU backend
- ctx->backend_cpu = ggml_backend_cpu_init();
+ ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
llama_free(ctx);
}
}
+ llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
std::vector<ggml_backend_t> backend_ptrs;
for (auto & backend : ctx->backends) {
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
- if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+ auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+ if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
 // for the CPU backend, use the host buffer of the first device for faster transfer of the intermediate state
auto * dev = model->devices[0];
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
// pipeline parallelism requires support for async compute and events in all devices
if (pipeline_parallel) {
for (auto & backend : ctx->backends) {
- if (ggml_backend_is_cpu(backend.get())) {
+ auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+ if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
// ignore CPU backend
continue;
}
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
+
+ for (auto & backend : ctx->backends) {
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+ if (set_abort_callback_fn) {
+ set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+ }
+ }
}
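
From the application side the usage is unchanged; a small sketch (ctx is assumed to be an already-initialized llama_context, and install_abort_flag is just an illustrative helper):

#include "llama.h"
#include <atomic>

static std::atomic<bool> g_stop{false};

// returning true aborts the graph computation of the current llama_decode call
static bool should_abort(void * data) {
    (void) data;
    return g_stop.load();
}

static void install_abort_flag(struct llama_context * ctx) {
    // with this change, the callback is also forwarded to every backend that exposes
    // "ggml_backend_set_abort_callback" through its registry
    llama_set_abort_callback(ctx, should_abort, nullptr);
}
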
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
}
const char * llama_print_system_info(void) {
- ggml_cpu_init(); // some ARM features are detected at runtime
-
static std::string s;
- s = "";
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
- s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
- s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
- s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
- s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
- s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
- s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
- s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
- s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
- s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
- s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
+ s.clear(); // the string is static; reset it so that repeated calls do not accumulate entries
+
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+ auto * reg = ggml_backend_reg_get(i);
+ auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+ if (get_features_fn) {
+ ggml_backend_feature * features = get_features_fn(reg);
+ s += ggml_backend_reg_name(reg);
+ s += " : ";
+ for (; features->name; features++) {
+ s += features->name;
+ s += " = ";
+ s += features->value;
+ s += " | ";
+ }
+ }
+ }
return s.c_str();
}
# llama_target_and_test(test-double-float.cpp) # SLOW
llama_target_and_test(test-log.cpp)
llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)
llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-barrier.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)
-llama_target_and_test(test-rope.cpp)
-
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")
+if (NOT GGML_BACKEND_DL)
+ # these tests use the backends directly and cannot be built with dynamic loading
+ llama_target_and_test(test-barrier.cpp)
+ llama_target_and_test(test-quantize-fns.cpp)
+ llama_target_and_test(test-quantize-perf.cpp)
+ llama_target_and_test(test-rope.cpp)
+endif()
+
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
#include <ggml.h>
-#include <ggml-cpu.h>
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <cstdint>
#include <cstring>
#include <cinttypes>
-#include <functional>
#include <memory>
#include <random>
#include <stdio.h>
// determine number of runs
int n_runs;
+ bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
if (op_flops(out) > 0) {
// based on flops
const uint64_t GFLOP = 1000 * 1000 * 1000;
const uint64_t target_flops_cpu = 8ULL * GFLOP;
const uint64_t target_flops_gpu = 100ULL * GFLOP;
- uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+ uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
} else {
// based on memory size
const size_t GB = 1ULL << 30;
const size_t target_size_cpu = 8 * GB;
const size_t target_size_gpu = 32 * GB;
- size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+ size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
}
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
if (mode == MODE_TEST) {
auto test_cases = make_test_cases_eval();
- ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+ ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+ if (backend_cpu == NULL) {
+ printf(" Failed to initialize CPU backend\n");
+ return false;
+ }
size_t n_ok = 0;
for (auto & test : test_cases) {
}
}
- // enumerate backends
+ // load and enumerate backends
+ ggml_backend_load_all();
+
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
size_t n_ok = 0;
continue;
}
- ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
- GGML_ASSERT(backend != NULL);
-
- if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+ if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
printf(" Skipping CPU backend\n");
- ggml_backend_free(backend);
n_ok++;
continue;
}
+ ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+ GGML_ASSERT(backend != NULL);
+
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
if (ggml_backend_set_n_threads_fn) {